xref: /titanic_44/usr/src/uts/i86pc/os/cpupm/cpu_idle.c (revision 73a0bd151c1115bf39cc2caa30c7cbfdd86361c1)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include <sys/x86_archext.h>
27 #include <sys/machsystm.h>
28 #include <sys/x_call.h>
29 #include <sys/stat.h>
30 #include <sys/acpi/acpi.h>
31 #include <sys/acpica.h>
32 #include <sys/cpu_acpi.h>
33 #include <sys/cpu_idle.h>
34 #include <sys/cpupm.h>
35 #include <sys/hpet.h>
36 #include <sys/archsystm.h>
37 #include <vm/hat_i86.h>
38 #include <sys/dtrace.h>
39 #include <sys/sdt.h>
40 #include <sys/callb.h>
41 
42 extern void cpu_idle_adaptive(void);
43 extern uint32_t cpupm_next_cstate(cma_c_state_t *cs_data,
44     cpu_acpi_cstate_t *cstates, uint32_t cs_count, hrtime_t start);
45 
46 static int cpu_idle_init(cpu_t *);
47 static void cpu_idle_fini(cpu_t *);
48 static boolean_t cpu_deep_idle_callb(void *arg, int code);
49 static boolean_t cpu_idle_cpr_callb(void *arg, int code);
50 static void acpi_cpu_cstate(cpu_acpi_cstate_t *cstate);
51 
52 /*
53  * Interfaces for modules implementing Intel's deep c-state.
54  */
55 cpupm_state_ops_t cpu_idle_ops = {
56 	"Generic ACPI C-state Support",
57 	cpu_idle_init,
58 	cpu_idle_fini,
59 	NULL
60 };
61 
62 static kmutex_t		cpu_idle_callb_mutex;
63 static callb_id_t	cpu_deep_idle_callb_id;
64 static callb_id_t	cpu_idle_cpr_callb_id;
65 static uint_t		cpu_idle_cfg_state;
66 
67 static kmutex_t cpu_idle_mutex;
68 
69 cpu_idle_kstat_t cpu_idle_kstat = {
70 	{ "address_space_id",	KSTAT_DATA_STRING },
71 	{ "latency",		KSTAT_DATA_UINT32 },
72 	{ "power",		KSTAT_DATA_UINT32 },
73 };
74 
75 /*
76  * kstat update function of the c-state info
77  */
78 static int
79 cpu_idle_kstat_update(kstat_t *ksp, int flag)
80 {
81 	cpu_acpi_cstate_t *cstate = ksp->ks_private;
82 
83 	if (flag == KSTAT_WRITE) {
84 		return (EACCES);
85 	}
86 
87 	if (cstate->cs_addrspace_id == ACPI_ADR_SPACE_FIXED_HARDWARE) {
88 		kstat_named_setstr(&cpu_idle_kstat.addr_space_id,
89 		"FFixedHW");
90 	} else if (cstate->cs_addrspace_id == ACPI_ADR_SPACE_SYSTEM_IO) {
91 		kstat_named_setstr(&cpu_idle_kstat.addr_space_id,
92 		"SystemIO");
93 	} else {
94 		kstat_named_setstr(&cpu_idle_kstat.addr_space_id,
95 		"Unsupported");
96 	}
97 
98 	cpu_idle_kstat.cs_latency.value.ui32 = cstate->cs_latency;
99 	cpu_idle_kstat.cs_power.value.ui32 = cstate->cs_power;
100 
101 	return (0);
102 }
103 
104 /*
105  * c-state wakeup function.
106  * Similar to cpu_wakeup and cpu_wakeup_mwait except this function deals
107  * with CPUs asleep in MWAIT, HLT, or ACPI Deep C-State.
108  */
109 void
110 cstate_wakeup(cpu_t *cp, int bound)
111 {
112 	struct machcpu	*mcpu = &(cp->cpu_m);
113 	volatile uint32_t *mcpu_mwait = mcpu->mcpu_mwait;
114 	cpupart_t	*cpu_part;
115 	uint_t		cpu_found;
116 	processorid_t	cpu_sid;
117 
118 	cpu_part = cp->cpu_part;
119 	cpu_sid = cp->cpu_seqid;
120 	/*
121 	 * Clear the halted bit for that CPU since it will be woken up
122 	 * in a moment.
123 	 */
124 	if (bitset_in_set(&cpu_part->cp_haltset, cpu_sid)) {
125 		/*
126 		 * Clear the halted bit for that CPU since it will be
127 		 * poked in a moment.
128 		 */
129 		bitset_atomic_del(&cpu_part->cp_haltset, cpu_sid);
130 
131 		/*
132 		 * We may find the current CPU present in the halted cpuset
133 		 * if we're in the context of an interrupt that occurred
134 		 * before we had a chance to clear our bit in cpu_idle().
135 		 * Waking ourself is obviously unnecessary, since if
136 		 * we're here, we're not halted.
137 		 */
138 		if (cp != CPU) {
139 			/*
140 			 * Use correct wakeup mechanism
141 			 */
142 			if ((mcpu_mwait != NULL) &&
143 			    (*mcpu_mwait == MWAIT_HALTED))
144 				MWAIT_WAKEUP(cp);
145 			else
146 				poke_cpu(cp->cpu_id);
147 		}
148 		return;
149 	} else {
150 		/*
151 		 * This cpu isn't halted, but it's idle or undergoing a
152 		 * context switch. No need to awaken anyone else.
153 		 */
154 		if (cp->cpu_thread == cp->cpu_idle_thread ||
155 		    cp->cpu_disp_flags & CPU_DISP_DONTSTEAL)
156 			return;
157 	}
158 
159 	/*
160 	 * No need to wake up other CPUs if the thread we just enqueued
161 	 * is bound.
162 	 */
163 	if (bound)
164 		return;
165 
166 
167 	/*
168 	 * See if there's any other halted CPUs. If there are, then
169 	 * select one, and awaken it.
170 	 * It's possible that after we find a CPU, somebody else
171 	 * will awaken it before we get the chance.
172 	 * In that case, look again.
173 	 */
174 	do {
175 		cpu_found = bitset_find(&cpu_part->cp_haltset);
176 		if (cpu_found == (uint_t)-1)
177 			return;
178 
179 	} while (bitset_atomic_test_and_del(&cpu_part->cp_haltset,
180 	    cpu_found) < 0);
181 
182 	/*
183 	 * Must use correct wakeup mechanism to avoid lost wakeup of
184 	 * alternate cpu.
185 	 */
186 	if (cpu_found != CPU->cpu_seqid) {
187 		mcpu_mwait = cpu[cpu_found]->cpu_m.mcpu_mwait;
188 		if ((mcpu_mwait != NULL) && (*mcpu_mwait == MWAIT_HALTED))
189 			MWAIT_WAKEUP(cpu_seq[cpu_found]);
190 		else
191 			poke_cpu(cpu_seq[cpu_found]->cpu_id);
192 	}
193 }
194 
195 /*
196  * enter deep c-state handler
197  */
198 static void
199 acpi_cpu_cstate(cpu_acpi_cstate_t *cstate)
200 {
201 	volatile uint32_t	*mcpu_mwait = CPU->cpu_m.mcpu_mwait;
202 	cpu_t			*cpup = CPU;
203 	processorid_t		cpu_sid = cpup->cpu_seqid;
204 	cpupart_t		*cp = cpup->cpu_part;
205 	hrtime_t		lapic_expire;
206 	uint8_t			type = cstate->cs_addrspace_id;
207 	uint32_t		cs_type = cstate->cs_type;
208 	int			hset_update = 1;
209 	boolean_t		using_hpet_timer;
210 
211 	/*
212 	 * Set our mcpu_mwait here, so we can tell if anyone tries to
213 	 * wake us between now and when we call mwait.  No other cpu will
214 	 * attempt to set our mcpu_mwait until we add ourself to the haltset.
215 	 */
216 	if (mcpu_mwait) {
217 		if (type == ACPI_ADR_SPACE_SYSTEM_IO)
218 			*mcpu_mwait = MWAIT_WAKEUP_IPI;
219 		else
220 			*mcpu_mwait = MWAIT_HALTED;
221 	}
222 
223 	/*
224 	 * If this CPU is online, and there are multiple CPUs
225 	 * in the system, then we should note our halting
226 	 * by adding ourselves to the partition's halted CPU
227 	 * bitmap. This allows other CPUs to find/awaken us when
228 	 * work becomes available.
229 	 */
230 	if (cpup->cpu_flags & CPU_OFFLINE || ncpus == 1)
231 		hset_update = 0;
232 
233 	/*
234 	 * Add ourselves to the partition's halted CPUs bitmask
235 	 * and set our HALTED flag, if necessary.
236 	 *
237 	 * When a thread becomes runnable, it is placed on the queue
238 	 * and then the halted cpuset is checked to determine who
239 	 * (if anyone) should be awakened. We therefore need to first
240 	 * add ourselves to the halted cpuset, and and then check if there
241 	 * is any work available.
242 	 *
243 	 * Note that memory barriers after updating the HALTED flag
244 	 * are not necessary since an atomic operation (updating the bitmap)
245 	 * immediately follows. On x86 the atomic operation acts as a
246 	 * memory barrier for the update of cpu_disp_flags.
247 	 */
248 	if (hset_update) {
249 		cpup->cpu_disp_flags |= CPU_DISP_HALTED;
250 		bitset_atomic_add(&cp->cp_haltset, cpu_sid);
251 	}
252 
253 	/*
254 	 * Check to make sure there's really nothing to do.
255 	 * Work destined for this CPU may become available after
256 	 * this check. We'll be notified through the clearing of our
257 	 * bit in the halted CPU bitmask, and a write to our mcpu_mwait.
258 	 *
259 	 * disp_anywork() checks disp_nrunnable, so we do not have to later.
260 	 */
261 	if (disp_anywork()) {
262 		if (hset_update) {
263 			cpup->cpu_disp_flags &= ~CPU_DISP_HALTED;
264 			bitset_atomic_del(&cp->cp_haltset, cpu_sid);
265 		}
266 		return;
267 	}
268 
269 	/*
270 	 * We're on our way to being halted.
271 	 *
272 	 * The local APIC timer can stop in ACPI C2 and deeper c-states.
273 	 * Program the HPET hardware to substitute for this CPU's lAPIC timer.
274 	 * hpet.use_hpet_timer() disables the LAPIC Timer.  Make sure to
275 	 * start the LAPIC Timer again before leaving this function.
276 	 *
277 	 * hpet.use_hpet_timer disables interrupts, so we will awaken
278 	 * immediately after halting if someone tries to poke us between now
279 	 * and the time we actually halt.
280 	 */
281 	using_hpet_timer = hpet.use_hpet_timer(&lapic_expire);
282 
283 	/*
284 	 * We check for the presence of our bit after disabling interrupts.
285 	 * If it's cleared, we'll return. If the bit is cleared after
286 	 * we check then the cstate_wakeup() will pop us out of the halted
287 	 * state.
288 	 *
289 	 * This means that the ordering of the cstate_wakeup() and the clearing
290 	 * of the bit by cpu_wakeup is important.
291 	 * cpu_wakeup() must clear our mc_haltset bit, and then call
292 	 * cstate_wakeup().
293 	 * acpi_cpu_cstate() must disable interrupts, then check for the bit.
294 	 */
295 	if (hset_update && bitset_in_set(&cp->cp_haltset, cpu_sid) == 0) {
296 		hpet.use_lapic_timer(lapic_expire);
297 		cpup->cpu_disp_flags &= ~CPU_DISP_HALTED;
298 		return;
299 	}
300 
301 	/*
302 	 * The check for anything locally runnable is here for performance
303 	 * and isn't needed for correctness. disp_nrunnable ought to be
304 	 * in our cache still, so it's inexpensive to check, and if there
305 	 * is anything runnable we won't have to wait for the poke.
306 	 */
307 	if (cpup->cpu_disp->disp_nrunnable != 0) {
308 		hpet.use_lapic_timer(lapic_expire);
309 		if (hset_update) {
310 			cpup->cpu_disp_flags &= ~CPU_DISP_HALTED;
311 			bitset_atomic_del(&cp->cp_haltset, cpu_sid);
312 		}
313 		return;
314 	}
315 
316 	if (using_hpet_timer == B_FALSE) {
317 
318 		hpet.use_lapic_timer(lapic_expire);
319 
320 		/*
321 		 * We are currently unable to program the HPET to act as this
322 		 * CPU's proxy lAPIC timer.  This CPU cannot enter C2 or deeper
323 		 * because no timer is set to wake it up while its lAPIC timer
324 		 * stalls in deep C-States.
325 		 * Enter C1 instead.
326 		 *
327 		 * cstate_wake_cpu() will wake this CPU with an IPI which
328 		 * works with MWAIT.
329 		 */
330 		i86_monitor(mcpu_mwait, 0, 0);
331 		if ((*mcpu_mwait & ~MWAIT_WAKEUP_IPI) == MWAIT_HALTED) {
332 			cpu_dtrace_idle_probe(CPU_ACPI_C1);
333 
334 			tlb_going_idle();
335 			i86_mwait(0, 0);
336 			tlb_service();
337 
338 			cpu_dtrace_idle_probe(CPU_ACPI_C0);
339 		}
340 
341 		/*
342 		 * We're no longer halted
343 		 */
344 		if (hset_update) {
345 			cpup->cpu_disp_flags &= ~CPU_DISP_HALTED;
346 			bitset_atomic_del(&cp->cp_haltset, cpu_sid);
347 		}
348 		return;
349 	}
350 
351 	cpu_dtrace_idle_probe((uint_t)cs_type);
352 
353 	if (type == ACPI_ADR_SPACE_FIXED_HARDWARE) {
354 		/*
355 		 * We're on our way to being halted.
356 		 * To avoid a lost wakeup, arm the monitor before checking
357 		 * if another cpu wrote to mcpu_mwait to wake us up.
358 		 */
359 		i86_monitor(mcpu_mwait, 0, 0);
360 		if (*mcpu_mwait == MWAIT_HALTED) {
361 			uint32_t eax = cstate->cs_address;
362 			uint32_t ecx = 1;
363 
364 			tlb_going_idle();
365 			i86_mwait(eax, ecx);
366 			tlb_service();
367 		}
368 	} else if (type == ACPI_ADR_SPACE_SYSTEM_IO) {
369 		uint32_t value;
370 		ACPI_TABLE_FADT *gbl_FADT;
371 
372 		if (*mcpu_mwait == MWAIT_WAKEUP_IPI) {
373 			tlb_going_idle();
374 			(void) cpu_acpi_read_port(cstate->cs_address,
375 			    &value, 8);
376 			acpica_get_global_FADT(&gbl_FADT);
377 			(void) cpu_acpi_read_port(
378 			    gbl_FADT->XPmTimerBlock.Address, &value, 32);
379 			tlb_service();
380 		}
381 	}
382 
383 	/*
384 	 * The lAPIC timer may have stopped in deep c-state.
385 	 * Reprogram this CPU's lAPIC here before enabling interrupts.
386 	 */
387 	hpet.use_lapic_timer(lapic_expire);
388 
389 	cpu_dtrace_idle_probe(CPU_ACPI_C0);
390 
391 	/*
392 	 * We're no longer halted
393 	 */
394 	if (hset_update) {
395 		cpup->cpu_disp_flags &= ~CPU_DISP_HALTED;
396 		bitset_atomic_del(&cp->cp_haltset, cpu_sid);
397 	}
398 }
399 
400 /*
401  * indicate when bus masters are active
402  */
403 static uint32_t
404 cpu_acpi_bm_sts(void)
405 {
406 	uint32_t bm_sts = 0;
407 
408 	cpu_acpi_get_register(ACPI_BITREG_BUS_MASTER_STATUS, &bm_sts);
409 
410 	if (bm_sts)
411 		cpu_acpi_set_register(ACPI_BITREG_BUS_MASTER_STATUS, 1);
412 
413 	return (bm_sts);
414 }
415 
416 /*
417  * Idle the present CPU, deep c-state is supported
418  */
419 void
420 cpu_acpi_idle(void)
421 {
422 	cpu_t *cp = CPU;
423 	cpu_acpi_handle_t handle;
424 	cma_c_state_t *cs_data;
425 	cpu_acpi_cstate_t *cstates;
426 	hrtime_t start, end;
427 	int cpu_max_cstates;
428 	uint32_t cs_indx;
429 	uint16_t cs_type;
430 
431 	cpupm_mach_state_t *mach_state =
432 	    (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
433 	handle = mach_state->ms_acpi_handle;
434 	ASSERT(CPU_ACPI_CSTATES(handle) != NULL);
435 
436 	cs_data = mach_state->ms_cstate.cma_state.cstate;
437 	cstates = (cpu_acpi_cstate_t *)CPU_ACPI_CSTATES(handle);
438 	ASSERT(cstates != NULL);
439 	cpu_max_cstates = cpu_acpi_get_max_cstates(handle);
440 	if (cpu_max_cstates > CPU_MAX_CSTATES)
441 		cpu_max_cstates = CPU_MAX_CSTATES;
442 	if (cpu_max_cstates == 1) {	/* no ACPI c-state data */
443 		(*non_deep_idle_cpu)();
444 		return;
445 	}
446 
447 	start = gethrtime_unscaled();
448 
449 	cs_indx = cpupm_next_cstate(cs_data, cstates, cpu_max_cstates, start);
450 
451 	/*
452 	 * OSPM uses the BM_STS bit to determine the power state to enter
453 	 * when considering a transition to or from the C2/C3 power state.
454 	 * if C3 is determined, bus master activity demotes the power state
455 	 * to C2.
456 	 */
457 	if ((cstates[cs_indx].cs_type >= CPU_ACPI_C3) && cpu_acpi_bm_sts())
458 		--cs_indx;
459 	cs_type = cstates[cs_indx].cs_type;
460 
461 	/*
462 	 * BM_RLD determines if the Cx power state was exited as a result of
463 	 * bus master requests. Set this bit when using a C3 power state, and
464 	 * clear it when using a C1 or C2 power state.
465 	 */
466 	if ((CPU_ACPI_BM_INFO(handle) & BM_RLD) && (cs_type < CPU_ACPI_C3)) {
467 		cpu_acpi_set_register(ACPI_BITREG_BUS_MASTER_RLD, 0);
468 		CPU_ACPI_BM_INFO(handle) &= ~BM_RLD;
469 	}
470 
471 	if ((!(CPU_ACPI_BM_INFO(handle) & BM_RLD)) &&
472 	    (cs_type >= CPU_ACPI_C3)) {
473 		cpu_acpi_set_register(ACPI_BITREG_BUS_MASTER_RLD, 1);
474 		CPU_ACPI_BM_INFO(handle) |= BM_RLD;
475 	}
476 
477 	switch (cs_type) {
478 	default:
479 		/* FALLTHROUGH */
480 	case CPU_ACPI_C1:
481 		(*non_deep_idle_cpu)();
482 		break;
483 
484 	case CPU_ACPI_C2:
485 		acpi_cpu_cstate(&cstates[cs_indx]);
486 		break;
487 
488 	case CPU_ACPI_C3:
489 		/*
490 		 * recommended in ACPI spec, providing hardware mechanisms
491 		 * to prevent master from writing to memory (UP-only)
492 		 */
493 		if ((ncpus_online == 1) &&
494 		    (CPU_ACPI_BM_INFO(handle) & BM_CTL)) {
495 			cpu_acpi_set_register(ACPI_BITREG_ARB_DISABLE, 1);
496 			CPU_ACPI_BM_INFO(handle) |= BM_ARB_DIS;
497 		/*
498 		 * Today all Intel's processor support C3 share cache.
499 		 */
500 		} else if (x86_vendor != X86_VENDOR_Intel) {
501 			__acpi_wbinvd();
502 		}
503 		acpi_cpu_cstate(&cstates[cs_indx]);
504 		if (CPU_ACPI_BM_INFO(handle) & BM_ARB_DIS) {
505 			cpu_acpi_set_register(ACPI_BITREG_ARB_DISABLE, 0);
506 			CPU_ACPI_BM_INFO(handle) &= ~BM_ARB_DIS;
507 		}
508 		break;
509 	}
510 
511 	end = gethrtime_unscaled();
512 
513 	/*
514 	 * Update statistics
515 	 */
516 	cpupm_wakeup_cstate_data(cs_data, end);
517 }
518 
519 boolean_t
520 cpu_deep_cstates_supported(void)
521 {
522 	extern int	idle_cpu_no_deep_c;
523 
524 	if (idle_cpu_no_deep_c)
525 		return (B_FALSE);
526 
527 	if (!cpuid_deep_cstates_supported())
528 		return (B_FALSE);
529 
530 	if ((hpet.supported != HPET_FULL_SUPPORT) || !hpet.install_proxy())
531 		return (B_FALSE);
532 
533 	return (B_TRUE);
534 }
535 
536 /*
537  * Validate that this processor supports deep cstate and if so,
538  * get the c-state data from ACPI and cache it.
539  */
540 static int
541 cpu_idle_init(cpu_t *cp)
542 {
543 	cpupm_mach_state_t *mach_state =
544 	    (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
545 	cpu_acpi_handle_t handle = mach_state->ms_acpi_handle;
546 	cpu_acpi_cstate_t *cstate;
547 	char name[KSTAT_STRLEN];
548 	int cpu_max_cstates, i;
549 	ACPI_TABLE_FADT *gbl_FADT;
550 
551 	/*
552 	 * Cache the C-state specific ACPI data.
553 	 */
554 	if (cpu_acpi_cache_cstate_data(handle) != 0) {
555 		cmn_err(CE_NOTE,
556 		    "!cpu_idle_init: Failed to cache ACPI C-state data\n");
557 		cpu_idle_fini(cp);
558 		return (-1);
559 	}
560 
561 	/*
562 	 * Check the bus master arbitration control ability.
563 	 */
564 	acpica_get_global_FADT(&gbl_FADT);
565 	if (gbl_FADT->Pm2ControlBlock && gbl_FADT->Pm2ControlLength)
566 		CPU_ACPI_BM_INFO(handle) |= BM_CTL;
567 
568 	cstate = (cpu_acpi_cstate_t *)CPU_ACPI_CSTATES(handle);
569 
570 	cpu_max_cstates = cpu_acpi_get_max_cstates(handle);
571 
572 	for (i = CPU_ACPI_C1; i <= cpu_max_cstates; i++) {
573 		(void) snprintf(name, KSTAT_STRLEN - 1, "c%d", cstate->cs_type);
574 		/*
575 		 * Allocate, initialize and install cstate kstat
576 		 */
577 		cstate->cs_ksp = kstat_create("cstate", CPU->cpu_id,
578 		    name, "misc",
579 		    KSTAT_TYPE_NAMED,
580 		    sizeof (cpu_idle_kstat) / sizeof (kstat_named_t),
581 		    KSTAT_FLAG_VIRTUAL);
582 
583 		if (cstate->cs_ksp == NULL) {
584 			cmn_err(CE_NOTE, "kstat_create(c_state) fail");
585 		} else {
586 			cstate->cs_ksp->ks_data = &cpu_idle_kstat;
587 			cstate->cs_ksp->ks_lock = &cpu_idle_mutex;
588 			cstate->cs_ksp->ks_update = cpu_idle_kstat_update;
589 			cstate->cs_ksp->ks_data_size += MAXNAMELEN;
590 			cstate->cs_ksp->ks_private = cstate;
591 			kstat_install(cstate->cs_ksp);
592 			cstate++;
593 		}
594 	}
595 
596 	cpupm_alloc_domains(cp, CPUPM_C_STATES);
597 	cpupm_alloc_ms_cstate(cp);
598 
599 	if (cpu_deep_cstates_supported()) {
600 		mutex_enter(&cpu_idle_callb_mutex);
601 		if (cpu_deep_idle_callb_id == (callb_id_t)0)
602 			cpu_deep_idle_callb_id = callb_add(&cpu_deep_idle_callb,
603 			    (void *)NULL, CB_CL_CPU_DEEP_IDLE, "cpu_deep_idle");
604 		if (cpu_idle_cpr_callb_id == (callb_id_t)0)
605 			cpu_idle_cpr_callb_id = callb_add(&cpu_idle_cpr_callb,
606 			    (void *)NULL, CB_CL_CPR_PM, "cpu_idle_cpr");
607 		mutex_exit(&cpu_idle_callb_mutex);
608 	}
609 
610 	return (0);
611 }
612 
613 /*
614  * Free resources allocated by cpu_idle_init().
615  */
616 static void
617 cpu_idle_fini(cpu_t *cp)
618 {
619 	cpupm_mach_state_t *mach_state =
620 	    (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
621 	cpu_acpi_handle_t handle = mach_state->ms_acpi_handle;
622 	cpu_acpi_cstate_t *cstate;
623 	uint_t	cpu_max_cstates, i;
624 
625 	/*
626 	 * idle cpu points back to the generic one
627 	 */
628 	idle_cpu = CPU->cpu_m.mcpu_idle_cpu = non_deep_idle_cpu;
629 	disp_enq_thread = non_deep_idle_disp_enq_thread;
630 
631 	cstate = (cpu_acpi_cstate_t *)CPU_ACPI_CSTATES(handle);
632 	if (cstate) {
633 		cpu_max_cstates = cpu_acpi_get_max_cstates(handle);
634 
635 		for (i = CPU_ACPI_C1; i <= cpu_max_cstates; i++) {
636 			if (cstate->cs_ksp != NULL)
637 				kstat_delete(cstate->cs_ksp);
638 			cstate++;
639 		}
640 	}
641 
642 	cpupm_free_ms_cstate(cp);
643 	cpupm_free_domains(&cpupm_cstate_domains);
644 	cpu_acpi_free_cstate_data(handle);
645 
646 	mutex_enter(&cpu_idle_callb_mutex);
647 	if (cpu_deep_idle_callb_id != (callb_id_t)0) {
648 		(void) callb_delete(cpu_deep_idle_callb_id);
649 		cpu_deep_idle_callb_id = (callb_id_t)0;
650 	}
651 	if (cpu_idle_cpr_callb_id != (callb_id_t)0) {
652 		(void) callb_delete(cpu_idle_cpr_callb_id);
653 		cpu_idle_cpr_callb_id = (callb_id_t)0;
654 	}
655 	mutex_exit(&cpu_idle_callb_mutex);
656 }
657 
658 /*ARGSUSED*/
659 static boolean_t
660 cpu_deep_idle_callb(void *arg, int code)
661 {
662 	boolean_t rslt = B_TRUE;
663 
664 	mutex_enter(&cpu_idle_callb_mutex);
665 	switch (code) {
666 	case PM_DEFAULT_CPU_DEEP_IDLE:
667 		/*
668 		 * Default policy is same as enable
669 		 */
670 		/*FALLTHROUGH*/
671 	case PM_ENABLE_CPU_DEEP_IDLE:
672 		if ((cpu_idle_cfg_state & CPU_IDLE_DEEP_CFG) == 0)
673 			break;
674 
675 		if (hpet.callback(PM_ENABLE_CPU_DEEP_IDLE)) {
676 			disp_enq_thread = cstate_wakeup;
677 			idle_cpu = cpu_idle_adaptive;
678 			cpu_idle_cfg_state &= ~CPU_IDLE_DEEP_CFG;
679 		} else {
680 			rslt = B_FALSE;
681 		}
682 		break;
683 
684 	case PM_DISABLE_CPU_DEEP_IDLE:
685 		if (cpu_idle_cfg_state & CPU_IDLE_DEEP_CFG)
686 			break;
687 
688 		idle_cpu = non_deep_idle_cpu;
689 		if (hpet.callback(PM_DISABLE_CPU_DEEP_IDLE)) {
690 			disp_enq_thread = non_deep_idle_disp_enq_thread;
691 			cpu_idle_cfg_state |= CPU_IDLE_DEEP_CFG;
692 		}
693 		break;
694 
695 	default:
696 		cmn_err(CE_NOTE, "!cpu deep_idle_callb: invalid code %d\n",
697 		    code);
698 		break;
699 	}
700 	mutex_exit(&cpu_idle_callb_mutex);
701 	return (rslt);
702 }
703 
704 /*ARGSUSED*/
705 static boolean_t
706 cpu_idle_cpr_callb(void *arg, int code)
707 {
708 	boolean_t rslt = B_TRUE;
709 
710 	mutex_enter(&cpu_idle_callb_mutex);
711 	switch (code) {
712 	case CB_CODE_CPR_RESUME:
713 		if (hpet.callback(CB_CODE_CPR_RESUME)) {
714 			/*
715 			 * Do not enable dispatcher hooks if disabled by user.
716 			 */
717 			if (cpu_idle_cfg_state & CPU_IDLE_DEEP_CFG)
718 				break;
719 
720 			disp_enq_thread = cstate_wakeup;
721 			idle_cpu = cpu_idle_adaptive;
722 		} else {
723 			rslt = B_FALSE;
724 		}
725 		break;
726 
727 	case CB_CODE_CPR_CHKPT:
728 		idle_cpu = non_deep_idle_cpu;
729 		disp_enq_thread = non_deep_idle_disp_enq_thread;
730 		hpet.callback(CB_CODE_CPR_CHKPT);
731 		break;
732 
733 	default:
734 		cmn_err(CE_NOTE, "!cpudvr cpr_callb: invalid code %d\n", code);
735 		break;
736 	}
737 	mutex_exit(&cpu_idle_callb_mutex);
738 	return (rslt);
739 }
740 
741 /*
742  * handle _CST notification
743  */
744 void
745 cpuidle_cstate_instance(cpu_t *cp)
746 {
747 #ifndef	__xpv
748 	cpupm_mach_state_t	*mach_state =
749 	    (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
750 	cpu_acpi_handle_t	handle;
751 	struct machcpu		*mcpu;
752 	cpuset_t 		dom_cpu_set;
753 	kmutex_t		*pm_lock;
754 	int			result = 0;
755 	processorid_t		cpu_id;
756 
757 	if (mach_state == NULL) {
758 		return;
759 	}
760 
761 	ASSERT(mach_state->ms_cstate.cma_domain != NULL);
762 	dom_cpu_set = mach_state->ms_cstate.cma_domain->pm_cpus;
763 	pm_lock = &mach_state->ms_cstate.cma_domain->pm_lock;
764 
765 	/*
766 	 * Do for all the CPU's in the domain
767 	 */
768 	mutex_enter(pm_lock);
769 	do {
770 		CPUSET_FIND(dom_cpu_set, cpu_id);
771 		if (cpu_id == CPUSET_NOTINSET)
772 			break;
773 
774 		ASSERT(cpu_id >= 0 && cpu_id < NCPU);
775 		cp = cpu[cpu_id];
776 		mach_state = (cpupm_mach_state_t *)
777 		    cp->cpu_m.mcpu_pm_mach_state;
778 		if (!(mach_state->ms_caps & CPUPM_C_STATES)) {
779 			mutex_exit(pm_lock);
780 			return;
781 		}
782 		handle = mach_state->ms_acpi_handle;
783 		ASSERT(handle != NULL);
784 
785 		/*
786 		 * re-evaluate cstate object
787 		 */
788 		if (cpu_acpi_cache_cstate_data(handle) != 0) {
789 			cmn_err(CE_WARN, "Cannot re-evaluate the cpu c-state"
790 			    " object Instance: %d", cpu_id);
791 		}
792 		mutex_enter(&cpu_lock);
793 		mcpu = &(cp->cpu_m);
794 		mcpu->max_cstates = cpu_acpi_get_max_cstates(handle);
795 		if (mcpu->max_cstates > CPU_ACPI_C1) {
796 			hpet.callback(CST_EVENT_MULTIPLE_CSTATES);
797 			disp_enq_thread = cstate_wakeup;
798 			cp->cpu_m.mcpu_idle_cpu = cpu_acpi_idle;
799 		} else if (mcpu->max_cstates == CPU_ACPI_C1) {
800 			disp_enq_thread = non_deep_idle_disp_enq_thread;
801 			cp->cpu_m.mcpu_idle_cpu = non_deep_idle_cpu;
802 			hpet.callback(CST_EVENT_ONE_CSTATE);
803 		}
804 		mutex_exit(&cpu_lock);
805 
806 		CPUSET_ATOMIC_XDEL(dom_cpu_set, cpu_id, result);
807 		mutex_exit(pm_lock);
808 	} while (result < 0);
809 #endif
810 }
811 
812 /*
813  * handle the number or the type of available processor power states change
814  */
815 void
816 cpuidle_manage_cstates(void *ctx)
817 {
818 	cpu_t			*cp = ctx;
819 	processorid_t		cpu_id = cp->cpu_id;
820 	cpupm_mach_state_t	*mach_state =
821 	    (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
822 	boolean_t		is_ready;
823 
824 	if (mach_state == NULL) {
825 		return;
826 	}
827 
828 	/*
829 	 * We currently refuse to power manage if the CPU is not ready to
830 	 * take cross calls (cross calls fail silently if CPU is not ready
831 	 * for it).
832 	 *
833 	 * Additionally, for x86 platforms we cannot power manage
834 	 * any one instance, until all instances have been initialized.
835 	 * That's because we don't know what the CPU domains look like
836 	 * until all instances have been initialized.
837 	 */
838 	is_ready = CPUPM_XCALL_IS_READY(cpu_id) && cpupm_cstate_ready();
839 	if (!is_ready)
840 		return;
841 
842 	cpuidle_cstate_instance(cp);
843 }
844