xref: /titanic_51/usr/src/uts/i86pc/os/cpupm/cpu_idle.c (revision fc51f9bbbff02dbd8c3adf640b1a184ceeb58fa5)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 /*
26  * Copyright (c) 2009, Intel Corporation.
27  * All rights reserved.
28  */
29 
30 #include <sys/x86_archext.h>
31 #include <sys/machsystm.h>
32 #include <sys/x_call.h>
33 #include <sys/stat.h>
34 #include <sys/acpi/acpi.h>
35 #include <sys/acpica.h>
36 #include <sys/cpu_acpi.h>
37 #include <sys/cpu_idle.h>
38 #include <sys/cpupm.h>
39 #include <sys/hpet.h>
40 #include <sys/archsystm.h>
41 #include <vm/hat_i86.h>
42 #include <sys/dtrace.h>
43 #include <sys/sdt.h>
44 #include <sys/callb.h>
45 
/*
 * Timer selectors passed as the `timer' argument of cstate_use_timer():
 * CSTATE_USING_HPET asks for the HPET proxy, CSTATE_USING_LAT switches
 * back to the local APIC timer.
 */
#define	CSTATE_USING_HPET		1
#define	CSTATE_USING_LAT		2

/* Routines defined outside this file. */
extern void cpu_idle_adaptive(void);
extern uint32_t cpupm_next_cstate(cma_c_state_t *cs_data,
    cpu_acpi_cstate_t *cstates, uint32_t cs_count, hrtime_t start);

/* Forward declarations of the file-local routines. */
static int cpu_idle_init(cpu_t *);
static void cpu_idle_fini(cpu_t *);
static boolean_t cpu_deep_idle_callb(void *arg, int code);
static boolean_t cpu_idle_cpr_callb(void *arg, int code);
static void acpi_cpu_cstate(cpu_acpi_cstate_t *cstate);

static boolean_t cstate_use_timer(hrtime_t *lapic_expire, int timer);
60 
/*
 * Which timer is relied upon while a CPU sits in a deep C-state.  Both
 * flags start out B_FALSE and are set by cpu_deep_cstates_supported():
 *   cpu_cstate_arat - the local APIC timer is always-running (ARAT).
 *   cpu_cstate_hpet - the HPET proxy has been installed.
 */
static boolean_t cpu_cstate_arat = B_FALSE;
static boolean_t cpu_cstate_hpet = B_FALSE;
67 
/*
 * Interfaces for modules implementing Intel's deep c-state.
 * The initializer supplies a descriptive name followed by the init and
 * fini routines of this file; the last member is left NULL.
 */
cpupm_state_ops_t cpu_idle_ops = {
	"Generic ACPI C-state Support",
	cpu_idle_init,
	cpu_idle_fini,
	NULL
};
77 
/*
 * cpu_idle_callb_mutex is held while the two callback ids below are
 * registered or deleted and for the duration of cpu_deep_idle_callb() and
 * cpu_idle_cpr_callb().  cpu_idle_cfg_state carries CPU_IDLE_DEEP_CFG
 * while deep idle has been disabled through cpu_deep_idle_callb().
 */
static kmutex_t		cpu_idle_callb_mutex;
static callb_id_t	cpu_deep_idle_callb_id;
static callb_id_t	cpu_idle_cpr_callb_id;
static uint_t		cpu_idle_cfg_state;

/* Installed as ks_lock on every c-state kstat created by cpu_idle_init(). */
static kmutex_t cpu_idle_mutex;

/*
 * Named-kstat data shared by all c-state kstats; cpu_idle_kstat_update()
 * refreshes it from the cpu_acpi_cstate_t stored in the kstat's ks_private.
 */
cpu_idle_kstat_t cpu_idle_kstat = {
	{ "address_space_id",	KSTAT_DATA_STRING },
	{ "latency",		KSTAT_DATA_UINT32 },
	{ "power",		KSTAT_DATA_UINT32 },
};
90 
91 /*
92  * kstat update function of the c-state info
93  */
94 static int
95 cpu_idle_kstat_update(kstat_t *ksp, int flag)
96 {
97 	cpu_acpi_cstate_t *cstate = ksp->ks_private;
98 
99 	if (flag == KSTAT_WRITE) {
100 		return (EACCES);
101 	}
102 
103 	if (cstate->cs_addrspace_id == ACPI_ADR_SPACE_FIXED_HARDWARE) {
104 		kstat_named_setstr(&cpu_idle_kstat.addr_space_id,
105 		"FFixedHW");
106 	} else if (cstate->cs_addrspace_id == ACPI_ADR_SPACE_SYSTEM_IO) {
107 		kstat_named_setstr(&cpu_idle_kstat.addr_space_id,
108 		"SystemIO");
109 	} else {
110 		kstat_named_setstr(&cpu_idle_kstat.addr_space_id,
111 		"Unsupported");
112 	}
113 
114 	cpu_idle_kstat.cs_latency.value.ui32 = cstate->cs_latency;
115 	cpu_idle_kstat.cs_power.value.ui32 = cstate->cs_power;
116 
117 	return (0);
118 }
119 
120 /*
121  * Used during configuration callbacks to manage implementation specific
122  * details of the hardware timer used during Deep C-state.
123  */
124 boolean_t
125 cstate_timer_callback(int code)
126 {
127 	if (cpu_cstate_arat) {
128 		return (B_TRUE);
129 	} else if (cpu_cstate_hpet) {
130 		return (hpet.callback(code));
131 	}
132 	return (B_FALSE);
133 }
134 
135 /*
136  * Some Local APIC Timers do not work during Deep C-states.
137  * The Deep C-state idle function uses this function to ensure it is using a
138  * hardware timer that works during Deep C-states.  This function also
139  * switches the timer back to the LACPI Timer after Deep C-state.
140  */
141 static boolean_t
142 cstate_use_timer(hrtime_t *lapic_expire, int timer)
143 {
144 	if (cpu_cstate_arat)
145 		return (B_TRUE);
146 
147 	/*
148 	 * We have to return B_FALSE if no arat or hpet support
149 	 */
150 	if (!cpu_cstate_hpet)
151 		return (B_FALSE);
152 
153 	switch (timer) {
154 	case CSTATE_USING_HPET:
155 		return (hpet.use_hpet_timer(lapic_expire));
156 	case CSTATE_USING_LAT:
157 		hpet.use_lapic_timer(*lapic_expire);
158 		return (B_TRUE);
159 	default:
160 		return (B_FALSE);
161 	}
162 }
163 
164 /*
165  * c-state wakeup function.
166  * Similar to cpu_wakeup and cpu_wakeup_mwait except this function deals
167  * with CPUs asleep in MWAIT, HLT, or ACPI Deep C-State.
168  */
169 void
170 cstate_wakeup(cpu_t *cp, int bound)
171 {
172 	struct machcpu	*mcpu = &(cp->cpu_m);
173 	volatile uint32_t *mcpu_mwait = mcpu->mcpu_mwait;
174 	cpupart_t	*cpu_part;
175 	uint_t		cpu_found;
176 	processorid_t	cpu_sid;
177 
178 	cpu_part = cp->cpu_part;
179 	cpu_sid = cp->cpu_seqid;
180 	/*
181 	 * Clear the halted bit for that CPU since it will be woken up
182 	 * in a moment.
183 	 */
184 	if (bitset_in_set(&cpu_part->cp_haltset, cpu_sid)) {
185 		/*
186 		 * Clear the halted bit for that CPU since it will be
187 		 * poked in a moment.
188 		 */
189 		bitset_atomic_del(&cpu_part->cp_haltset, cpu_sid);
190 
191 		/*
192 		 * We may find the current CPU present in the halted cpuset
193 		 * if we're in the context of an interrupt that occurred
194 		 * before we had a chance to clear our bit in cpu_idle().
195 		 * Waking ourself is obviously unnecessary, since if
196 		 * we're here, we're not halted.
197 		 */
198 		if (cp != CPU) {
199 			/*
200 			 * Use correct wakeup mechanism
201 			 */
202 			if ((mcpu_mwait != NULL) &&
203 			    (*mcpu_mwait == MWAIT_HALTED))
204 				MWAIT_WAKEUP(cp);
205 			else
206 				poke_cpu(cp->cpu_id);
207 		}
208 		return;
209 	} else {
210 		/*
211 		 * This cpu isn't halted, but it's idle or undergoing a
212 		 * context switch. No need to awaken anyone else.
213 		 */
214 		if (cp->cpu_thread == cp->cpu_idle_thread ||
215 		    cp->cpu_disp_flags & CPU_DISP_DONTSTEAL)
216 			return;
217 	}
218 
219 	/*
220 	 * No need to wake up other CPUs if the thread we just enqueued
221 	 * is bound.
222 	 */
223 	if (bound)
224 		return;
225 
226 
227 	/*
228 	 * See if there's any other halted CPUs. If there are, then
229 	 * select one, and awaken it.
230 	 * It's possible that after we find a CPU, somebody else
231 	 * will awaken it before we get the chance.
232 	 * In that case, look again.
233 	 */
234 	do {
235 		cpu_found = bitset_find(&cpu_part->cp_haltset);
236 		if (cpu_found == (uint_t)-1)
237 			return;
238 
239 	} while (bitset_atomic_test_and_del(&cpu_part->cp_haltset,
240 	    cpu_found) < 0);
241 
242 	/*
243 	 * Must use correct wakeup mechanism to avoid lost wakeup of
244 	 * alternate cpu.
245 	 */
246 	if (cpu_found != CPU->cpu_seqid) {
247 		mcpu_mwait = cpu[cpu_found]->cpu_m.mcpu_mwait;
248 		if ((mcpu_mwait != NULL) && (*mcpu_mwait == MWAIT_HALTED))
249 			MWAIT_WAKEUP(cpu_seq[cpu_found]);
250 		else
251 			poke_cpu(cpu_seq[cpu_found]->cpu_id);
252 	}
253 }
254 
/*
 * enter deep c-state handler
 *
 * Puts the current CPU into the ACPI C-state described by cstate.  The
 * CPU advertises itself in its partition's haltset (unless it is offline
 * or the only CPU), re-checks for work, switches to a timer usable in deep
 * C-state via cstate_use_timer(), and then idles through MWAIT
 * (ACPI_ADR_SPACE_FIXED_HARDWARE) or an I/O port read
 * (ACPI_ADR_SPACE_SYSTEM_IO).  When no such timer can be armed it idles
 * with i86_mwait(0, 0) and reports CPU_ACPI_C1 instead.  Every exit path
 * switches back to the LAPIC timer, re-enables interrupts if they were
 * disabled, and clears the HALTED state.
 */
static void
acpi_cpu_cstate(cpu_acpi_cstate_t *cstate)
{
	volatile uint32_t	*mcpu_mwait = CPU->cpu_m.mcpu_mwait;
	cpu_t			*cpup = CPU;
	processorid_t		cpu_sid = cpup->cpu_seqid;
	cpupart_t		*cp = cpup->cpu_part;
	hrtime_t		lapic_expire;
	uint8_t			type = cstate->cs_addrspace_id;
	uint32_t		cs_type = cstate->cs_type;
	int			hset_update = 1;
	boolean_t		using_timer;

	/*
	 * Set our mcpu_mwait here, so we can tell if anyone tries to
	 * wake us between now and when we call mwait.  No other cpu will
	 * attempt to set our mcpu_mwait until we add ourself to the haltset.
	 */
	if (mcpu_mwait) {
		if (type == ACPI_ADR_SPACE_SYSTEM_IO)
			*mcpu_mwait = MWAIT_WAKEUP_IPI;
		else
			*mcpu_mwait = MWAIT_HALTED;
	}

	/*
	 * If this CPU is online, and there are multiple CPUs
	 * in the system, then we should note our halting
	 * by adding ourselves to the partition's halted CPU
	 * bitmap. This allows other CPUs to find/awaken us when
	 * work becomes available.
	 */
	if (cpup->cpu_flags & CPU_OFFLINE || ncpus == 1)
		hset_update = 0;

	/*
	 * Add ourselves to the partition's halted CPUs bitmask
	 * and set our HALTED flag, if necessary.
	 *
	 * When a thread becomes runnable, it is placed on the queue
	 * and then the halted cpuset is checked to determine who
	 * (if anyone) should be awakened. We therefore need to first
	 * add ourselves to the halted cpuset, and then check if there
	 * is any work available.
	 *
	 * Note that memory barriers after updating the HALTED flag
	 * are not necessary since an atomic operation (updating the bitmap)
	 * immediately follows. On x86 the atomic operation acts as a
	 * memory barrier for the update of cpu_disp_flags.
	 */
	if (hset_update) {
		cpup->cpu_disp_flags |= CPU_DISP_HALTED;
		bitset_atomic_add(&cp->cp_haltset, cpu_sid);
	}

	/*
	 * Check to make sure there's really nothing to do.
	 * Work destined for this CPU may become available after
	 * this check. We'll be notified through the clearing of our
	 * bit in the halted CPU bitmask, and a write to our mcpu_mwait.
	 *
	 * disp_anywork() checks disp_nrunnable, so we do not have to later.
	 */
	if (disp_anywork()) {
		if (hset_update) {
			cpup->cpu_disp_flags &= ~CPU_DISP_HALTED;
			bitset_atomic_del(&cp->cp_haltset, cpu_sid);
		}
		return;
	}

	/*
	 * We're on our way to being halted.
	 *
	 * The local APIC timer can stop in ACPI C2 and deeper c-states.
	 * Try to program the HPET hardware to substitute for this CPU's
	 * LAPIC timer.
	 * cstate_use_timer() could disable the LAPIC Timer.  Make sure
	 * to start the LAPIC Timer again before leaving this function.
	 *
	 * Disable interrupts here so we will awaken immediately after halting
	 * if someone tries to poke us between now and the time we actually
	 * halt.
	 */
	cli();
	using_timer = cstate_use_timer(&lapic_expire, CSTATE_USING_HPET);

	/*
	 * We check for the presence of our bit after disabling interrupts.
	 * If it's cleared, we'll return. If the bit is cleared after
	 * we check then the cstate_wakeup() will pop us out of the halted
	 * state.
	 *
	 * This means that the ordering of the cstate_wakeup() and the clearing
	 * of the bit by cpu_wakeup is important.
	 * cpu_wakeup() must clear our cp_haltset bit, and then call
	 * cstate_wakeup().
	 * acpi_cpu_cstate() must disable interrupts, then check for the bit.
	 *
	 * The haltset bit is already gone on this path, so only the HALTED
	 * flag has to be cleared before returning.
	 */
	if (hset_update && bitset_in_set(&cp->cp_haltset, cpu_sid) == 0) {
		(void) cstate_use_timer(&lapic_expire,
		    CSTATE_USING_LAT);
		sti();
		cpup->cpu_disp_flags &= ~CPU_DISP_HALTED;
		return;
	}

	/*
	 * The check for anything locally runnable is here for performance
	 * and isn't needed for correctness. disp_nrunnable ought to be
	 * in our cache still, so it's inexpensive to check, and if there
	 * is anything runnable we won't have to wait for the poke.
	 */
	if (cpup->cpu_disp->disp_nrunnable != 0) {
		(void) cstate_use_timer(&lapic_expire,
		    CSTATE_USING_LAT);
		sti();
		if (hset_update) {
			cpup->cpu_disp_flags &= ~CPU_DISP_HALTED;
			bitset_atomic_del(&cp->cp_haltset, cpu_sid);
		}
		return;
	}

	if (using_timer == B_FALSE) {

		(void) cstate_use_timer(&lapic_expire,
		    CSTATE_USING_LAT);
		sti();

		/*
		 * We are currently unable to program the HPET to act as this
		 * CPU's proxy LAPIC timer.  This CPU cannot enter C2 or deeper
		 * because no timer is set to wake it up while its LAPIC timer
		 * stalls in deep C-States.
		 * Enter C1 instead.
		 *
		 * cstate_wakeup() will wake this CPU with an IPI which
		 * works with MWAIT.
		 *
		 * NOTE(review): mcpu_mwait was NULL-checked at the top of
		 * this function but is dereferenced unconditionally from
		 * here on - confirm it can never be NULL on these paths.
		 */
		i86_monitor(mcpu_mwait, 0, 0);
		if ((*mcpu_mwait & ~MWAIT_WAKEUP_IPI) == MWAIT_HALTED) {
			cpu_dtrace_idle_probe(CPU_ACPI_C1);

			tlb_going_idle();
			i86_mwait(0, 0);
			tlb_service();

			cpu_dtrace_idle_probe(CPU_ACPI_C0);
		}

		/*
		 * We're no longer halted
		 */
		if (hset_update) {
			cpup->cpu_disp_flags &= ~CPU_DISP_HALTED;
			bitset_atomic_del(&cp->cp_haltset, cpu_sid);
		}
		return;
	}

	/* Report the C-state we are about to enter. */
	cpu_dtrace_idle_probe((uint_t)cs_type);

	if (type == ACPI_ADR_SPACE_FIXED_HARDWARE) {
		/*
		 * We're on our way to being halted.
		 * To avoid a lost wakeup, arm the monitor before checking
		 * if another cpu wrote to mcpu_mwait to wake us up.
		 */
		i86_monitor(mcpu_mwait, 0, 0);
		if (*mcpu_mwait == MWAIT_HALTED) {
			uint32_t eax = cstate->cs_address;
			uint32_t ecx = 1;

			tlb_going_idle();
			i86_mwait(eax, ecx);
			tlb_service();
		}
	} else if (type == ACPI_ADR_SPACE_SYSTEM_IO) {
		uint32_t value;
		ACPI_TABLE_FADT *gbl_FADT;

		/*
		 * Idle by reading the C-state's I/O port, followed by a
		 * read of the FADT's PM timer block; both values read are
		 * discarded.
		 */
		if (*mcpu_mwait == MWAIT_WAKEUP_IPI) {
			tlb_going_idle();
			(void) cpu_acpi_read_port(cstate->cs_address,
			    &value, 8);
			acpica_get_global_FADT(&gbl_FADT);
			(void) cpu_acpi_read_port(
			    gbl_FADT->XPmTimerBlock.Address, &value, 32);
			tlb_service();
		}
	}

	/*
	 * The LAPIC timer may have stopped in deep c-state.
	 * Reprogram this CPU's LAPIC here before enabling interrupts.
	 */
	(void) cstate_use_timer(&lapic_expire, CSTATE_USING_LAT);
	sti();

	cpu_dtrace_idle_probe(CPU_ACPI_C0);

	/*
	 * We're no longer halted
	 */
	if (hset_update) {
		cpup->cpu_disp_flags &= ~CPU_DISP_HALTED;
		bitset_atomic_del(&cp->cp_haltset, cpu_sid);
	}
}
468 
469 /*
470  * indicate when bus masters are active
471  */
472 static uint32_t
473 cpu_acpi_bm_sts(void)
474 {
475 	uint32_t bm_sts = 0;
476 
477 	cpu_acpi_get_register(ACPI_BITREG_BUS_MASTER_STATUS, &bm_sts);
478 
479 	if (bm_sts)
480 		cpu_acpi_set_register(ACPI_BITREG_BUS_MASTER_STATUS, 1);
481 
482 	return (bm_sts);
483 }
484 
485 /*
486  * Idle the present CPU, deep c-state is supported
487  */
488 void
489 cpu_acpi_idle(void)
490 {
491 	cpu_t *cp = CPU;
492 	cpu_acpi_handle_t handle;
493 	cma_c_state_t *cs_data;
494 	cpu_acpi_cstate_t *cstates;
495 	hrtime_t start, end;
496 	int cpu_max_cstates;
497 	uint32_t cs_indx;
498 	uint16_t cs_type;
499 
500 	cpupm_mach_state_t *mach_state =
501 	    (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
502 	handle = mach_state->ms_acpi_handle;
503 	ASSERT(CPU_ACPI_CSTATES(handle) != NULL);
504 
505 	cs_data = mach_state->ms_cstate.cma_state.cstate;
506 	cstates = (cpu_acpi_cstate_t *)CPU_ACPI_CSTATES(handle);
507 	ASSERT(cstates != NULL);
508 	cpu_max_cstates = cpu_acpi_get_max_cstates(handle);
509 	if (cpu_max_cstates > CPU_MAX_CSTATES)
510 		cpu_max_cstates = CPU_MAX_CSTATES;
511 	if (cpu_max_cstates == 1) {	/* no ACPI c-state data */
512 		(*non_deep_idle_cpu)();
513 		return;
514 	}
515 
516 	start = gethrtime_unscaled();
517 
518 	cs_indx = cpupm_next_cstate(cs_data, cstates, cpu_max_cstates, start);
519 
520 	/*
521 	 * OSPM uses the BM_STS bit to determine the power state to enter
522 	 * when considering a transition to or from the C2/C3 power state.
523 	 * if C3 is determined, bus master activity demotes the power state
524 	 * to C2.
525 	 */
526 	if ((cstates[cs_indx].cs_type >= CPU_ACPI_C3) && cpu_acpi_bm_sts())
527 		--cs_indx;
528 	cs_type = cstates[cs_indx].cs_type;
529 
530 	/*
531 	 * BM_RLD determines if the Cx power state was exited as a result of
532 	 * bus master requests. Set this bit when using a C3 power state, and
533 	 * clear it when using a C1 or C2 power state.
534 	 */
535 	if ((CPU_ACPI_BM_INFO(handle) & BM_RLD) && (cs_type < CPU_ACPI_C3)) {
536 		cpu_acpi_set_register(ACPI_BITREG_BUS_MASTER_RLD, 0);
537 		CPU_ACPI_BM_INFO(handle) &= ~BM_RLD;
538 	}
539 
540 	if ((!(CPU_ACPI_BM_INFO(handle) & BM_RLD)) &&
541 	    (cs_type >= CPU_ACPI_C3)) {
542 		cpu_acpi_set_register(ACPI_BITREG_BUS_MASTER_RLD, 1);
543 		CPU_ACPI_BM_INFO(handle) |= BM_RLD;
544 	}
545 
546 	switch (cs_type) {
547 	default:
548 		/* FALLTHROUGH */
549 	case CPU_ACPI_C1:
550 		(*non_deep_idle_cpu)();
551 		break;
552 
553 	case CPU_ACPI_C2:
554 		acpi_cpu_cstate(&cstates[cs_indx]);
555 		break;
556 
557 	case CPU_ACPI_C3:
558 		/*
559 		 * recommended in ACPI spec, providing hardware mechanisms
560 		 * to prevent master from writing to memory (UP-only)
561 		 */
562 		if ((ncpus_online == 1) &&
563 		    (CPU_ACPI_BM_INFO(handle) & BM_CTL)) {
564 			cpu_acpi_set_register(ACPI_BITREG_ARB_DISABLE, 1);
565 			CPU_ACPI_BM_INFO(handle) |= BM_ARB_DIS;
566 		/*
567 		 * Today all Intel's processor support C3 share cache.
568 		 */
569 		} else if (x86_vendor != X86_VENDOR_Intel) {
570 			__acpi_wbinvd();
571 		}
572 		acpi_cpu_cstate(&cstates[cs_indx]);
573 		if (CPU_ACPI_BM_INFO(handle) & BM_ARB_DIS) {
574 			cpu_acpi_set_register(ACPI_BITREG_ARB_DISABLE, 0);
575 			CPU_ACPI_BM_INFO(handle) &= ~BM_ARB_DIS;
576 		}
577 		break;
578 	}
579 
580 	end = gethrtime_unscaled();
581 
582 	/*
583 	 * Update statistics
584 	 */
585 	cpupm_wakeup_cstate_data(cs_data, end);
586 }
587 
588 boolean_t
589 cpu_deep_cstates_supported(void)
590 {
591 	extern int	idle_cpu_no_deep_c;
592 
593 	if (idle_cpu_no_deep_c)
594 		return (B_FALSE);
595 
596 	if (!cpuid_deep_cstates_supported())
597 		return (B_FALSE);
598 
599 	if (cpuid_arat_supported()) {
600 		cpu_cstate_arat = B_TRUE;
601 		return (B_TRUE);
602 	}
603 
604 	if ((hpet.supported == HPET_FULL_SUPPORT) &&
605 	    hpet.install_proxy()) {
606 		cpu_cstate_hpet = B_TRUE;
607 		return (B_TRUE);
608 	}
609 
610 	return (B_FALSE);
611 }
612 
613 /*
614  * Validate that this processor supports deep cstate and if so,
615  * get the c-state data from ACPI and cache it.
616  */
617 static int
618 cpu_idle_init(cpu_t *cp)
619 {
620 	cpupm_mach_state_t *mach_state =
621 	    (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
622 	cpu_acpi_handle_t handle = mach_state->ms_acpi_handle;
623 	cpu_acpi_cstate_t *cstate;
624 	char name[KSTAT_STRLEN];
625 	int cpu_max_cstates, i;
626 	ACPI_TABLE_FADT *gbl_FADT;
627 
628 	/*
629 	 * Cache the C-state specific ACPI data.
630 	 */
631 	if (cpu_acpi_cache_cstate_data(handle) != 0) {
632 		cmn_err(CE_NOTE,
633 		    "!cpu_idle_init: Failed to cache ACPI C-state data\n");
634 		cpu_idle_fini(cp);
635 		return (-1);
636 	}
637 
638 	/*
639 	 * Check the bus master arbitration control ability.
640 	 */
641 	acpica_get_global_FADT(&gbl_FADT);
642 	if (gbl_FADT->Pm2ControlBlock && gbl_FADT->Pm2ControlLength)
643 		CPU_ACPI_BM_INFO(handle) |= BM_CTL;
644 
645 	cstate = (cpu_acpi_cstate_t *)CPU_ACPI_CSTATES(handle);
646 
647 	cpu_max_cstates = cpu_acpi_get_max_cstates(handle);
648 
649 	for (i = CPU_ACPI_C1; i <= cpu_max_cstates; i++) {
650 		(void) snprintf(name, KSTAT_STRLEN - 1, "c%d", cstate->cs_type);
651 		/*
652 		 * Allocate, initialize and install cstate kstat
653 		 */
654 		cstate->cs_ksp = kstat_create("cstate", CPU->cpu_id,
655 		    name, "misc",
656 		    KSTAT_TYPE_NAMED,
657 		    sizeof (cpu_idle_kstat) / sizeof (kstat_named_t),
658 		    KSTAT_FLAG_VIRTUAL);
659 
660 		if (cstate->cs_ksp == NULL) {
661 			cmn_err(CE_NOTE, "kstat_create(c_state) fail");
662 		} else {
663 			cstate->cs_ksp->ks_data = &cpu_idle_kstat;
664 			cstate->cs_ksp->ks_lock = &cpu_idle_mutex;
665 			cstate->cs_ksp->ks_update = cpu_idle_kstat_update;
666 			cstate->cs_ksp->ks_data_size += MAXNAMELEN;
667 			cstate->cs_ksp->ks_private = cstate;
668 			kstat_install(cstate->cs_ksp);
669 			cstate++;
670 		}
671 	}
672 
673 	cpupm_alloc_domains(cp, CPUPM_C_STATES);
674 	cpupm_alloc_ms_cstate(cp);
675 
676 	if (cpu_deep_cstates_supported()) {
677 		mutex_enter(&cpu_idle_callb_mutex);
678 		if (cpu_deep_idle_callb_id == (callb_id_t)0)
679 			cpu_deep_idle_callb_id = callb_add(&cpu_deep_idle_callb,
680 			    (void *)NULL, CB_CL_CPU_DEEP_IDLE, "cpu_deep_idle");
681 		if (cpu_idle_cpr_callb_id == (callb_id_t)0)
682 			cpu_idle_cpr_callb_id = callb_add(&cpu_idle_cpr_callb,
683 			    (void *)NULL, CB_CL_CPR_PM, "cpu_idle_cpr");
684 		mutex_exit(&cpu_idle_callb_mutex);
685 	}
686 
687 	return (0);
688 }
689 
690 /*
691  * Free resources allocated by cpu_idle_init().
692  */
693 static void
694 cpu_idle_fini(cpu_t *cp)
695 {
696 	cpupm_mach_state_t *mach_state =
697 	    (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
698 	cpu_acpi_handle_t handle = mach_state->ms_acpi_handle;
699 	cpu_acpi_cstate_t *cstate;
700 	uint_t	cpu_max_cstates, i;
701 
702 	/*
703 	 * idle cpu points back to the generic one
704 	 */
705 	idle_cpu = CPU->cpu_m.mcpu_idle_cpu = non_deep_idle_cpu;
706 	disp_enq_thread = non_deep_idle_disp_enq_thread;
707 
708 	cstate = (cpu_acpi_cstate_t *)CPU_ACPI_CSTATES(handle);
709 	if (cstate) {
710 		cpu_max_cstates = cpu_acpi_get_max_cstates(handle);
711 
712 		for (i = CPU_ACPI_C1; i <= cpu_max_cstates; i++) {
713 			if (cstate->cs_ksp != NULL)
714 				kstat_delete(cstate->cs_ksp);
715 			cstate++;
716 		}
717 	}
718 
719 	cpupm_free_ms_cstate(cp);
720 	cpupm_free_domains(&cpupm_cstate_domains);
721 	cpu_acpi_free_cstate_data(handle);
722 
723 	mutex_enter(&cpu_idle_callb_mutex);
724 	if (cpu_deep_idle_callb_id != (callb_id_t)0) {
725 		(void) callb_delete(cpu_deep_idle_callb_id);
726 		cpu_deep_idle_callb_id = (callb_id_t)0;
727 	}
728 	if (cpu_idle_cpr_callb_id != (callb_id_t)0) {
729 		(void) callb_delete(cpu_idle_cpr_callb_id);
730 		cpu_idle_cpr_callb_id = (callb_id_t)0;
731 	}
732 	mutex_exit(&cpu_idle_callb_mutex);
733 }
734 
735 /*ARGSUSED*/
736 static boolean_t
737 cpu_deep_idle_callb(void *arg, int code)
738 {
739 	boolean_t rslt = B_TRUE;
740 
741 	mutex_enter(&cpu_idle_callb_mutex);
742 	switch (code) {
743 	case PM_DEFAULT_CPU_DEEP_IDLE:
744 		/*
745 		 * Default policy is same as enable
746 		 */
747 		/*FALLTHROUGH*/
748 	case PM_ENABLE_CPU_DEEP_IDLE:
749 		if ((cpu_idle_cfg_state & CPU_IDLE_DEEP_CFG) == 0)
750 			break;
751 
752 		if (cstate_timer_callback(PM_ENABLE_CPU_DEEP_IDLE)) {
753 			disp_enq_thread = cstate_wakeup;
754 			idle_cpu = cpu_idle_adaptive;
755 			cpu_idle_cfg_state &= ~CPU_IDLE_DEEP_CFG;
756 		} else {
757 			rslt = B_FALSE;
758 		}
759 		break;
760 
761 	case PM_DISABLE_CPU_DEEP_IDLE:
762 		if (cpu_idle_cfg_state & CPU_IDLE_DEEP_CFG)
763 			break;
764 
765 		idle_cpu = non_deep_idle_cpu;
766 		if (cstate_timer_callback(PM_DISABLE_CPU_DEEP_IDLE)) {
767 			disp_enq_thread = non_deep_idle_disp_enq_thread;
768 			cpu_idle_cfg_state |= CPU_IDLE_DEEP_CFG;
769 		}
770 		break;
771 
772 	default:
773 		cmn_err(CE_NOTE, "!cpu deep_idle_callb: invalid code %d\n",
774 		    code);
775 		break;
776 	}
777 	mutex_exit(&cpu_idle_callb_mutex);
778 	return (rslt);
779 }
780 
781 /*ARGSUSED*/
782 static boolean_t
783 cpu_idle_cpr_callb(void *arg, int code)
784 {
785 	boolean_t rslt = B_TRUE;
786 
787 	mutex_enter(&cpu_idle_callb_mutex);
788 	switch (code) {
789 	case CB_CODE_CPR_RESUME:
790 		if (cstate_timer_callback(CB_CODE_CPR_RESUME)) {
791 			/*
792 			 * Do not enable dispatcher hooks if disabled by user.
793 			 */
794 			if (cpu_idle_cfg_state & CPU_IDLE_DEEP_CFG)
795 				break;
796 
797 			disp_enq_thread = cstate_wakeup;
798 			idle_cpu = cpu_idle_adaptive;
799 		} else {
800 			rslt = B_FALSE;
801 		}
802 		break;
803 
804 	case CB_CODE_CPR_CHKPT:
805 		idle_cpu = non_deep_idle_cpu;
806 		disp_enq_thread = non_deep_idle_disp_enq_thread;
807 		(void) cstate_timer_callback(CB_CODE_CPR_CHKPT);
808 		break;
809 
810 	default:
811 		cmn_err(CE_NOTE, "!cpudvr cpr_callb: invalid code %d\n", code);
812 		break;
813 	}
814 	mutex_exit(&cpu_idle_callb_mutex);
815 	return (rslt);
816 }
817 
818 /*
819  * handle _CST notification
820  */
821 void
822 cpuidle_cstate_instance(cpu_t *cp)
823 {
824 #ifndef	__xpv
825 	cpupm_mach_state_t	*mach_state =
826 	    (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
827 	cpu_acpi_handle_t	handle;
828 	struct machcpu		*mcpu;
829 	cpuset_t 		dom_cpu_set;
830 	kmutex_t		*pm_lock;
831 	int			result = 0;
832 	processorid_t		cpu_id;
833 
834 	if (mach_state == NULL) {
835 		return;
836 	}
837 
838 	ASSERT(mach_state->ms_cstate.cma_domain != NULL);
839 	dom_cpu_set = mach_state->ms_cstate.cma_domain->pm_cpus;
840 	pm_lock = &mach_state->ms_cstate.cma_domain->pm_lock;
841 
842 	/*
843 	 * Do for all the CPU's in the domain
844 	 */
845 	mutex_enter(pm_lock);
846 	do {
847 		CPUSET_FIND(dom_cpu_set, cpu_id);
848 		if (cpu_id == CPUSET_NOTINSET)
849 			break;
850 
851 		ASSERT(cpu_id >= 0 && cpu_id < NCPU);
852 		cp = cpu[cpu_id];
853 		mach_state = (cpupm_mach_state_t *)
854 		    cp->cpu_m.mcpu_pm_mach_state;
855 		if (!(mach_state->ms_caps & CPUPM_C_STATES)) {
856 			mutex_exit(pm_lock);
857 			return;
858 		}
859 		handle = mach_state->ms_acpi_handle;
860 		ASSERT(handle != NULL);
861 
862 		/*
863 		 * re-evaluate cstate object
864 		 */
865 		if (cpu_acpi_cache_cstate_data(handle) != 0) {
866 			cmn_err(CE_WARN, "Cannot re-evaluate the cpu c-state"
867 			    " object Instance: %d", cpu_id);
868 		}
869 		mutex_enter(&cpu_lock);
870 		mcpu = &(cp->cpu_m);
871 		mcpu->max_cstates = cpu_acpi_get_max_cstates(handle);
872 		if (mcpu->max_cstates > CPU_ACPI_C1) {
873 			(void) cstate_timer_callback(
874 			    CST_EVENT_MULTIPLE_CSTATES);
875 			disp_enq_thread = cstate_wakeup;
876 			cp->cpu_m.mcpu_idle_cpu = cpu_acpi_idle;
877 		} else if (mcpu->max_cstates == CPU_ACPI_C1) {
878 			disp_enq_thread = non_deep_idle_disp_enq_thread;
879 			cp->cpu_m.mcpu_idle_cpu = non_deep_idle_cpu;
880 			(void) cstate_timer_callback(CST_EVENT_ONE_CSTATE);
881 		}
882 		mutex_exit(&cpu_lock);
883 
884 		CPUSET_ATOMIC_XDEL(dom_cpu_set, cpu_id, result);
885 		mutex_exit(pm_lock);
886 	} while (result < 0);
887 #endif
888 }
889 
890 /*
891  * handle the number or the type of available processor power states change
892  */
893 void
894 cpuidle_manage_cstates(void *ctx)
895 {
896 	cpu_t			*cp = ctx;
897 	processorid_t		cpu_id = cp->cpu_id;
898 	cpupm_mach_state_t	*mach_state =
899 	    (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
900 	boolean_t		is_ready;
901 
902 	if (mach_state == NULL) {
903 		return;
904 	}
905 
906 	/*
907 	 * We currently refuse to power manage if the CPU is not ready to
908 	 * take cross calls (cross calls fail silently if CPU is not ready
909 	 * for it).
910 	 *
911 	 * Additionally, for x86 platforms we cannot power manage
912 	 * any one instance, until all instances have been initialized.
913 	 * That's because we don't know what the CPU domains look like
914 	 * until all instances have been initialized.
915 	 */
916 	is_ready = CPUPM_XCALL_IS_READY(cpu_id) && cpupm_cstate_ready();
917 	if (!is_ready)
918 		return;
919 
920 	cpuidle_cstate_instance(cp);
921 }
922