xref: /titanic_50/usr/src/uts/i86pc/os/cpupm/cpu_idle.c (revision 585995d5d19489bf178112c08c8c61ffc049ff6e)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include <sys/x86_archext.h>
27 #include <sys/machsystm.h>
28 #include <sys/x_call.h>
29 #include <sys/stat.h>
30 #include <sys/acpi/acpi.h>
31 #include <sys/acpica.h>
32 #include <sys/cpu_acpi.h>
33 #include <sys/cpu_idle.h>
34 #include <sys/cpupm.h>
35 #include <sys/hpet.h>
36 #include <sys/archsystm.h>
37 #include <vm/hat_i86.h>
38 #include <sys/dtrace.h>
39 #include <sys/sdt.h>
40 #include <sys/callb.h>
41 
42 extern void cpu_idle_adaptive(void);
43 
44 static int cpu_idle_init(cpu_t *);
45 static void cpu_idle_fini(cpu_t *);
46 static boolean_t cpu_deep_idle_callb(void *arg, int code);
47 static boolean_t cpu_idle_cpr_callb(void *arg, int code);
48 static void acpi_cpu_cstate(cpu_acpi_cstate_t *cstate);
49 static void cpuidle_set_cstate_latency(cpu_t *cp);
50 
51 /*
52  * Interfaces for modules implementing Intel's deep c-state.
53  */
54 cpupm_state_ops_t cpu_idle_ops = {
55 	"Generic ACPI C-state Support",
56 	cpu_idle_init,
57 	cpu_idle_fini,
58 	NULL
59 };
60 
61 static kmutex_t		cpu_idle_callb_mutex;
62 static callb_id_t	cpu_deep_idle_callb_id;
63 static callb_id_t	cpu_idle_cpr_callb_id;
64 static uint_t		cpu_idle_cfg_state;
65 
66 static kmutex_t cpu_idle_mutex;
67 
68 cpu_idle_kstat_t cpu_idle_kstat = {
69 	{ "address_space_id",	KSTAT_DATA_STRING },
70 	{ "latency",		KSTAT_DATA_UINT32 },
71 	{ "power",		KSTAT_DATA_UINT32 },
72 };
73 
74 /*
75  * kstat update function of the c-state info
76  */
77 static int
78 cpu_idle_kstat_update(kstat_t *ksp, int flag)
79 {
80 	cpu_acpi_cstate_t *cstate = ksp->ks_private;
81 
82 	if (flag == KSTAT_WRITE) {
83 		return (EACCES);
84 	}
85 
86 	if (cstate->cs_addrspace_id == ACPI_ADR_SPACE_FIXED_HARDWARE) {
87 		kstat_named_setstr(&cpu_idle_kstat.addr_space_id,
88 		"FFixedHW");
89 	} else if (cstate->cs_addrspace_id == ACPI_ADR_SPACE_SYSTEM_IO) {
90 		kstat_named_setstr(&cpu_idle_kstat.addr_space_id,
91 		"SystemIO");
92 	} else {
93 		kstat_named_setstr(&cpu_idle_kstat.addr_space_id,
94 		"Unsupported");
95 	}
96 
97 	cpu_idle_kstat.cs_latency.value.ui32 = cstate->cs_latency;
98 	cpu_idle_kstat.cs_power.value.ui32 = cstate->cs_power;
99 
100 	return (0);
101 }
102 
103 /*
104  * c-state wakeup function.
105  * Similar to cpu_wakeup and cpu_wakeup_mwait except this function deals
106  * with CPUs asleep in MWAIT, HLT, or ACPI Deep C-State.
107  */
108 void
109 cstate_wakeup(cpu_t *cp, int bound)
110 {
111 	struct machcpu	*mcpu = &(cp->cpu_m);
112 	volatile uint32_t *mcpu_mwait = mcpu->mcpu_mwait;
113 	cpupart_t	*cpu_part;
114 	uint_t		cpu_found;
115 	processorid_t	cpu_sid;
116 
117 	cpu_part = cp->cpu_part;
118 	cpu_sid = cp->cpu_seqid;
119 	/*
120 	 * Clear the halted bit for that CPU since it will be woken up
121 	 * in a moment.
122 	 */
123 	if (bitset_in_set(&cpu_part->cp_haltset, cpu_sid)) {
124 		/*
125 		 * Clear the halted bit for that CPU since it will be
126 		 * poked in a moment.
127 		 */
128 		bitset_atomic_del(&cpu_part->cp_haltset, cpu_sid);
129 
130 		/*
131 		 * We may find the current CPU present in the halted cpuset
132 		 * if we're in the context of an interrupt that occurred
133 		 * before we had a chance to clear our bit in cpu_idle().
134 		 * Waking ourself is obviously unnecessary, since if
135 		 * we're here, we're not halted.
136 		 */
137 		if (cp != CPU) {
138 			/*
139 			 * Use correct wakeup mechanism
140 			 */
141 			if ((mcpu_mwait != NULL) &&
142 			    (*mcpu_mwait == MWAIT_HALTED))
143 				MWAIT_WAKEUP(cp);
144 			else
145 				poke_cpu(cp->cpu_id);
146 		}
147 		return;
148 	} else {
149 		/*
150 		 * This cpu isn't halted, but it's idle or undergoing a
151 		 * context switch. No need to awaken anyone else.
152 		 */
153 		if (cp->cpu_thread == cp->cpu_idle_thread ||
154 		    cp->cpu_disp_flags & CPU_DISP_DONTSTEAL)
155 			return;
156 	}
157 
158 	/*
159 	 * No need to wake up other CPUs if the thread we just enqueued
160 	 * is bound.
161 	 */
162 	if (bound)
163 		return;
164 
165 
166 	/*
167 	 * See if there's any other halted CPUs. If there are, then
168 	 * select one, and awaken it.
169 	 * It's possible that after we find a CPU, somebody else
170 	 * will awaken it before we get the chance.
171 	 * In that case, look again.
172 	 */
173 	do {
174 		cpu_found = bitset_find(&cpu_part->cp_haltset);
175 		if (cpu_found == (uint_t)-1)
176 			return;
177 
178 	} while (bitset_atomic_test_and_del(&cpu_part->cp_haltset,
179 	    cpu_found) < 0);
180 
181 	/*
182 	 * Must use correct wakeup mechanism to avoid lost wakeup of
183 	 * alternate cpu.
184 	 */
185 	if (cpu_found != CPU->cpu_seqid) {
186 		mcpu_mwait = cpu[cpu_found]->cpu_m.mcpu_mwait;
187 		if ((mcpu_mwait != NULL) && (*mcpu_mwait == MWAIT_HALTED))
188 			MWAIT_WAKEUP(cpu_seq[cpu_found]);
189 		else
190 			poke_cpu(cpu_seq[cpu_found]->cpu_id);
191 	}
192 }
193 
194 /*
195  * enter deep c-state handler
196  */
197 static void
198 acpi_cpu_cstate(cpu_acpi_cstate_t *cstate)
199 {
200 	volatile uint32_t	*mcpu_mwait = CPU->cpu_m.mcpu_mwait;
201 	cpu_t			*cpup = CPU;
202 	processorid_t		cpu_sid = cpup->cpu_seqid;
203 	cpupart_t		*cp = cpup->cpu_part;
204 	hrtime_t		lapic_expire;
205 	uint8_t			type = cstate->cs_addrspace_id;
206 	uint32_t		cs_type = cstate->cs_type;
207 	int			hset_update = 1;
208 	boolean_t		using_hpet_timer;
209 
210 	/*
211 	 * Set our mcpu_mwait here, so we can tell if anyone tries to
212 	 * wake us between now and when we call mwait.  No other cpu will
213 	 * attempt to set our mcpu_mwait until we add ourself to the haltset.
214 	 */
215 	if (mcpu_mwait) {
216 		if (type == ACPI_ADR_SPACE_SYSTEM_IO)
217 			*mcpu_mwait = MWAIT_WAKEUP_IPI;
218 		else
219 			*mcpu_mwait = MWAIT_HALTED;
220 	}
221 
222 	/*
223 	 * If this CPU is online, and there are multiple CPUs
224 	 * in the system, then we should note our halting
225 	 * by adding ourselves to the partition's halted CPU
226 	 * bitmap. This allows other CPUs to find/awaken us when
227 	 * work becomes available.
228 	 */
229 	if (cpup->cpu_flags & CPU_OFFLINE || ncpus == 1)
230 		hset_update = 0;
231 
232 	/*
233 	 * Add ourselves to the partition's halted CPUs bitmask
234 	 * and set our HALTED flag, if necessary.
235 	 *
236 	 * When a thread becomes runnable, it is placed on the queue
237 	 * and then the halted cpuset is checked to determine who
238 	 * (if anyone) should be awakened. We therefore need to first
239 	 * add ourselves to the halted cpuset, and and then check if there
240 	 * is any work available.
241 	 *
242 	 * Note that memory barriers after updating the HALTED flag
243 	 * are not necessary since an atomic operation (updating the bitmap)
244 	 * immediately follows. On x86 the atomic operation acts as a
245 	 * memory barrier for the update of cpu_disp_flags.
246 	 */
247 	if (hset_update) {
248 		cpup->cpu_disp_flags |= CPU_DISP_HALTED;
249 		bitset_atomic_add(&cp->cp_haltset, cpu_sid);
250 	}
251 
252 	/*
253 	 * Check to make sure there's really nothing to do.
254 	 * Work destined for this CPU may become available after
255 	 * this check. We'll be notified through the clearing of our
256 	 * bit in the halted CPU bitmask, and a write to our mcpu_mwait.
257 	 *
258 	 * disp_anywork() checks disp_nrunnable, so we do not have to later.
259 	 */
260 	if (disp_anywork()) {
261 		if (hset_update) {
262 			cpup->cpu_disp_flags &= ~CPU_DISP_HALTED;
263 			bitset_atomic_del(&cp->cp_haltset, cpu_sid);
264 		}
265 		return;
266 	}
267 
268 	/*
269 	 * We're on our way to being halted.
270 	 *
271 	 * The local APIC timer can stop in ACPI C2 and deeper c-states.
272 	 * Program the HPET hardware to substitute for this CPU's lAPIC timer.
273 	 * hpet.use_hpet_timer() disables the LAPIC Timer.  Make sure to
274 	 * start the LAPIC Timer again before leaving this function.
275 	 *
276 	 * hpet.use_hpet_timer disables interrupts, so we will awaken
277 	 * immediately after halting if someone tries to poke us between now
278 	 * and the time we actually halt.
279 	 */
280 	using_hpet_timer = hpet.use_hpet_timer(&lapic_expire);
281 
282 	/*
283 	 * We check for the presence of our bit after disabling interrupts.
284 	 * If it's cleared, we'll return. If the bit is cleared after
285 	 * we check then the cstate_wakeup() will pop us out of the halted
286 	 * state.
287 	 *
288 	 * This means that the ordering of the cstate_wakeup() and the clearing
289 	 * of the bit by cpu_wakeup is important.
290 	 * cpu_wakeup() must clear our mc_haltset bit, and then call
291 	 * cstate_wakeup().
292 	 * acpi_cpu_cstate() must disable interrupts, then check for the bit.
293 	 */
294 	if (hset_update && bitset_in_set(&cp->cp_haltset, cpu_sid) == 0) {
295 		hpet.use_lapic_timer(lapic_expire);
296 		cpup->cpu_disp_flags &= ~CPU_DISP_HALTED;
297 		return;
298 	}
299 
300 	/*
301 	 * The check for anything locally runnable is here for performance
302 	 * and isn't needed for correctness. disp_nrunnable ought to be
303 	 * in our cache still, so it's inexpensive to check, and if there
304 	 * is anything runnable we won't have to wait for the poke.
305 	 */
306 	if (cpup->cpu_disp->disp_nrunnable != 0) {
307 		hpet.use_lapic_timer(lapic_expire);
308 		if (hset_update) {
309 			cpup->cpu_disp_flags &= ~CPU_DISP_HALTED;
310 			bitset_atomic_del(&cp->cp_haltset, cpu_sid);
311 		}
312 		return;
313 	}
314 
315 	if (using_hpet_timer == B_FALSE) {
316 
317 		hpet.use_lapic_timer(lapic_expire);
318 
319 		/*
320 		 * We are currently unable to program the HPET to act as this
321 		 * CPU's proxy lAPIC timer.  This CPU cannot enter C2 or deeper
322 		 * because no timer is set to wake it up while its lAPIC timer
323 		 * stalls in deep C-States.
324 		 * Enter C1 instead.
325 		 *
326 		 * cstate_wake_cpu() will wake this CPU with an IPI which
327 		 * works with MWAIT.
328 		 */
329 		i86_monitor(mcpu_mwait, 0, 0);
330 		if ((*mcpu_mwait & ~MWAIT_WAKEUP_IPI) == MWAIT_HALTED) {
331 			cpu_dtrace_idle_probe(CPU_ACPI_C1);
332 
333 			tlb_going_idle();
334 			i86_mwait(0, 0);
335 			tlb_service();
336 
337 			cpu_dtrace_idle_probe(CPU_ACPI_C0);
338 		}
339 
340 		/*
341 		 * We're no longer halted
342 		 */
343 		if (hset_update) {
344 			cpup->cpu_disp_flags &= ~CPU_DISP_HALTED;
345 			bitset_atomic_del(&cp->cp_haltset, cpu_sid);
346 		}
347 		return;
348 	}
349 
350 	cpu_dtrace_idle_probe((uint_t)cs_type);
351 
352 	if (type == ACPI_ADR_SPACE_FIXED_HARDWARE) {
353 		/*
354 		 * We're on our way to being halted.
355 		 * To avoid a lost wakeup, arm the monitor before checking
356 		 * if another cpu wrote to mcpu_mwait to wake us up.
357 		 */
358 		i86_monitor(mcpu_mwait, 0, 0);
359 		if (*mcpu_mwait == MWAIT_HALTED) {
360 			uint32_t eax = cstate->cs_address;
361 			uint32_t ecx = 1;
362 
363 			tlb_going_idle();
364 			i86_mwait(eax, ecx);
365 			tlb_service();
366 		}
367 	} else if (type == ACPI_ADR_SPACE_SYSTEM_IO) {
368 		uint32_t value;
369 		ACPI_TABLE_FADT *gbl_FADT;
370 
371 		if (*mcpu_mwait == MWAIT_WAKEUP_IPI) {
372 			tlb_going_idle();
373 			(void) cpu_acpi_read_port(cstate->cs_address,
374 			    &value, 8);
375 			acpica_get_global_FADT(&gbl_FADT);
376 			(void) cpu_acpi_read_port(
377 			    gbl_FADT->XPmTimerBlock.Address, &value, 32);
378 			tlb_service();
379 		}
380 	} else {
381 		cmn_err(CE_WARN, "!_CST: cs_type %lx bad asid type %lx\n",
382 		    (long)cs_type, (long)type);
383 	}
384 
385 	/*
386 	 * The lAPIC timer may have stopped in deep c-state.
387 	 * Reprogram this CPU's lAPIC here before enabling interrupts.
388 	 */
389 	hpet.use_lapic_timer(lapic_expire);
390 
391 	cpu_dtrace_idle_probe(CPU_ACPI_C0);
392 
393 	/*
394 	 * We're no longer halted
395 	 */
396 	if (hset_update) {
397 		cpup->cpu_disp_flags &= ~CPU_DISP_HALTED;
398 		bitset_atomic_del(&cp->cp_haltset, cpu_sid);
399 	}
400 }
401 
402 /*
403  * indicate when bus masters are active
404  */
405 static uint32_t
406 cpu_acpi_bm_sts(void)
407 {
408 	uint32_t bm_sts = 0;
409 
410 	cpu_acpi_get_register(ACPI_BITREG_BUS_MASTER_STATUS, &bm_sts);
411 
412 	if (bm_sts)
413 		cpu_acpi_set_register(ACPI_BITREG_BUS_MASTER_STATUS, 1);
414 
415 	return (bm_sts);
416 }
417 
418 /*
419  * Idle the present CPU, deep c-state is supported
420  */
421 void
422 cpu_acpi_idle(void)
423 {
424 	cpu_t *cp = CPU;
425 	uint16_t cs_type;
426 	cpu_acpi_handle_t handle;
427 	cma_c_state_t *cs_data;
428 	cpu_acpi_cstate_t *cstate;
429 	hrtime_t start, end;
430 	int cpu_max_cstates;
431 
432 	cpupm_mach_state_t *mach_state =
433 	    (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
434 	handle = mach_state->ms_acpi_handle;
435 	ASSERT(CPU_ACPI_CSTATES(handle) != NULL);
436 
437 	cs_data = mach_state->ms_cstate.cma_state.cstate;
438 	cstate = (cpu_acpi_cstate_t *)CPU_ACPI_CSTATES(handle);
439 	ASSERT(cstate != NULL);
440 	cpu_max_cstates = cpu_acpi_get_max_cstates(handle);
441 	if (cpu_max_cstates > CPU_MAX_CSTATES)
442 		cpu_max_cstates = CPU_MAX_CSTATES;
443 
444 	start = gethrtime_unscaled();
445 
446 	cs_type = cpupm_next_cstate(cs_data, start);
447 
448 	/*
449 	 * OSPM uses the BM_STS bit to determine the power state to enter
450 	 * when considering a transition to or from the C2/C3 power state.
451 	 * if C3 is determined, bus master activity demotes the power state
452 	 * to C2.
453 	 */
454 	if ((cs_type >= CPU_ACPI_C3) && cpu_acpi_bm_sts())
455 		cs_type = CPU_ACPI_C2;
456 
457 	/*
458 	 * BM_RLD determines if the Cx power state was exited as a result of
459 	 * bus master requests. Set this bit when using a C3 power state, and
460 	 * clear it when using a C1 or C2 power state.
461 	 */
462 	if ((CPU_ACPI_BM_INFO(handle) & BM_RLD) && (cs_type < CPU_ACPI_C3)) {
463 		cpu_acpi_set_register(ACPI_BITREG_BUS_MASTER_RLD, 0);
464 		CPU_ACPI_BM_INFO(handle) &= ~BM_RLD;
465 	}
466 
467 	if ((!(CPU_ACPI_BM_INFO(handle) & BM_RLD)) &&
468 	    (cs_type >= CPU_ACPI_C3)) {
469 		cpu_acpi_set_register(ACPI_BITREG_BUS_MASTER_RLD, 1);
470 		CPU_ACPI_BM_INFO(handle) |= BM_RLD;
471 	}
472 
473 	cstate += cs_type - 1;
474 
475 	switch (cs_type) {
476 	default:
477 		/* FALLTHROUGH */
478 	case CPU_ACPI_C1:
479 		(*non_deep_idle_cpu)();
480 		break;
481 
482 	case CPU_ACPI_C2:
483 		acpi_cpu_cstate(cstate);
484 		break;
485 
486 	case CPU_ACPI_C3:
487 		/*
488 		 * recommended in ACPI spec, providing hardware mechanisms
489 		 * to prevent master from writing to memory (UP-only)
490 		 */
491 		if ((ncpus_online == 1) &&
492 		    (CPU_ACPI_BM_INFO(handle) & BM_CTL)) {
493 			cpu_acpi_set_register(ACPI_BITREG_ARB_DISABLE, 1);
494 			CPU_ACPI_BM_INFO(handle) |= BM_ARB_DIS;
495 		/*
496 		 * Today all Intel's processor support C3 share cache.
497 		 */
498 		} else if (x86_vendor != X86_VENDOR_Intel) {
499 			__acpi_wbinvd();
500 		}
501 		acpi_cpu_cstate(cstate);
502 		if (CPU_ACPI_BM_INFO(handle) & BM_ARB_DIS) {
503 			cpu_acpi_set_register(ACPI_BITREG_ARB_DISABLE, 0);
504 			CPU_ACPI_BM_INFO(handle) &= ~BM_ARB_DIS;
505 		}
506 		break;
507 	}
508 
509 	end = gethrtime_unscaled();
510 
511 	/*
512 	 * Update statistics
513 	 */
514 	cpupm_wakeup_cstate_data(cs_data, end);
515 }
516 
517 boolean_t
518 cpu_deep_cstates_supported(void)
519 {
520 	extern int	idle_cpu_no_deep_c;
521 
522 	if (idle_cpu_no_deep_c)
523 		return (B_FALSE);
524 
525 	if (!cpuid_deep_cstates_supported())
526 		return (B_FALSE);
527 
528 	if ((hpet.supported != HPET_FULL_SUPPORT) || !hpet.install_proxy())
529 		return (B_FALSE);
530 
531 	return (B_TRUE);
532 }
533 
534 /*
535  * Validate that this processor supports deep cstate and if so,
536  * get the c-state data from ACPI and cache it.
537  */
538 static int
539 cpu_idle_init(cpu_t *cp)
540 {
541 	cpupm_mach_state_t *mach_state =
542 	    (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
543 	cpu_acpi_handle_t handle = mach_state->ms_acpi_handle;
544 	cpu_acpi_cstate_t *cstate;
545 	char name[KSTAT_STRLEN];
546 	int cpu_max_cstates, i;
547 	ACPI_TABLE_FADT *gbl_FADT;
548 
549 	/*
550 	 * Cache the C-state specific ACPI data.
551 	 */
552 	if (cpu_acpi_cache_cstate_data(handle) != 0) {
553 		cmn_err(CE_NOTE,
554 		    "!cpu_idle_init: Failed to cache ACPI C-state data\n");
555 		cpu_idle_fini(cp);
556 		return (-1);
557 	}
558 
559 	/*
560 	 * Check the bus master arbitration control ability.
561 	 */
562 	acpica_get_global_FADT(&gbl_FADT);
563 	if (gbl_FADT->Pm2ControlBlock && gbl_FADT->Pm2ControlLength)
564 		CPU_ACPI_BM_INFO(handle) |= BM_CTL;
565 
566 	cstate = (cpu_acpi_cstate_t *)CPU_ACPI_CSTATES(handle);
567 
568 	cpu_max_cstates = cpu_acpi_get_max_cstates(handle);
569 
570 	for (i = CPU_ACPI_C1; i <= cpu_max_cstates; i++) {
571 		(void) snprintf(name, KSTAT_STRLEN - 1, "c%d", cstate->cs_type);
572 		/*
573 		 * Allocate, initialize and install cstate kstat
574 		 */
575 		cstate->cs_ksp = kstat_create("cstate", CPU->cpu_id,
576 		    name, "misc",
577 		    KSTAT_TYPE_NAMED,
578 		    sizeof (cpu_idle_kstat) / sizeof (kstat_named_t),
579 		    KSTAT_FLAG_VIRTUAL);
580 
581 		if (cstate->cs_ksp == NULL) {
582 			cmn_err(CE_NOTE, "kstat_create(c_state) fail");
583 		} else {
584 			cstate->cs_ksp->ks_data = &cpu_idle_kstat;
585 			cstate->cs_ksp->ks_lock = &cpu_idle_mutex;
586 			cstate->cs_ksp->ks_update = cpu_idle_kstat_update;
587 			cstate->cs_ksp->ks_data_size += MAXNAMELEN;
588 			cstate->cs_ksp->ks_private = cstate;
589 			kstat_install(cstate->cs_ksp);
590 			cstate++;
591 		}
592 	}
593 
594 	cpupm_alloc_domains(cp, CPUPM_C_STATES);
595 	cpupm_alloc_ms_cstate(cp);
596 	cpuidle_set_cstate_latency(cp);
597 
598 	if (cpu_deep_cstates_supported()) {
599 		mutex_enter(&cpu_idle_callb_mutex);
600 		if (cpu_deep_idle_callb_id == (callb_id_t)0)
601 			cpu_deep_idle_callb_id = callb_add(&cpu_deep_idle_callb,
602 			    (void *)NULL, CB_CL_CPU_DEEP_IDLE, "cpu_deep_idle");
603 		if (cpu_idle_cpr_callb_id == (callb_id_t)0)
604 			cpu_idle_cpr_callb_id = callb_add(&cpu_idle_cpr_callb,
605 			    (void *)NULL, CB_CL_CPR_PM, "cpu_idle_cpr");
606 		mutex_exit(&cpu_idle_callb_mutex);
607 	}
608 
609 	return (0);
610 }
611 
612 /*
613  * Free resources allocated by cpu_idle_init().
614  */
615 static void
616 cpu_idle_fini(cpu_t *cp)
617 {
618 	cpupm_mach_state_t *mach_state =
619 	    (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
620 	cpu_acpi_handle_t handle = mach_state->ms_acpi_handle;
621 	cpu_acpi_cstate_t *cstate;
622 	uint_t	cpu_max_cstates, i;
623 
624 	/*
625 	 * idle cpu points back to the generic one
626 	 */
627 	idle_cpu = CPU->cpu_m.mcpu_idle_cpu = non_deep_idle_cpu;
628 	disp_enq_thread = non_deep_idle_disp_enq_thread;
629 
630 	cstate = (cpu_acpi_cstate_t *)CPU_ACPI_CSTATES(handle);
631 	if (cstate) {
632 		cpu_max_cstates = cpu_acpi_get_max_cstates(handle);
633 
634 		for (i = CPU_ACPI_C1; i <= cpu_max_cstates; i++) {
635 			if (cstate->cs_ksp != NULL)
636 				kstat_delete(cstate->cs_ksp);
637 			cstate++;
638 		}
639 	}
640 
641 	cpupm_free_ms_cstate(cp);
642 	cpupm_free_domains(&cpupm_cstate_domains);
643 	cpu_acpi_free_cstate_data(handle);
644 
645 	mutex_enter(&cpu_idle_callb_mutex);
646 	if (cpu_deep_idle_callb_id != (callb_id_t)0) {
647 		(void) callb_delete(cpu_deep_idle_callb_id);
648 		cpu_deep_idle_callb_id = (callb_id_t)0;
649 	}
650 	if (cpu_idle_cpr_callb_id != (callb_id_t)0) {
651 		(void) callb_delete(cpu_idle_cpr_callb_id);
652 		cpu_idle_cpr_callb_id = (callb_id_t)0;
653 	}
654 	mutex_exit(&cpu_idle_callb_mutex);
655 }
656 
657 /*ARGSUSED*/
658 static boolean_t
659 cpu_deep_idle_callb(void *arg, int code)
660 {
661 	boolean_t rslt = B_TRUE;
662 
663 	mutex_enter(&cpu_idle_callb_mutex);
664 	switch (code) {
665 	case PM_DEFAULT_CPU_DEEP_IDLE:
666 		/*
667 		 * Default policy is same as enable
668 		 */
669 		/*FALLTHROUGH*/
670 	case PM_ENABLE_CPU_DEEP_IDLE:
671 		if ((cpu_idle_cfg_state & CPU_IDLE_DEEP_CFG) == 0)
672 			break;
673 
674 		if (hpet.callback(PM_ENABLE_CPU_DEEP_IDLE)) {
675 			disp_enq_thread = cstate_wakeup;
676 			idle_cpu = cpu_idle_adaptive;
677 			cpu_idle_cfg_state &= ~CPU_IDLE_DEEP_CFG;
678 		} else {
679 			rslt = B_FALSE;
680 		}
681 		break;
682 
683 	case PM_DISABLE_CPU_DEEP_IDLE:
684 		if (cpu_idle_cfg_state & CPU_IDLE_DEEP_CFG)
685 			break;
686 
687 		idle_cpu = non_deep_idle_cpu;
688 		if (hpet.callback(PM_DISABLE_CPU_DEEP_IDLE)) {
689 			disp_enq_thread = non_deep_idle_disp_enq_thread;
690 			cpu_idle_cfg_state |= CPU_IDLE_DEEP_CFG;
691 		}
692 		break;
693 
694 	default:
695 		cmn_err(CE_NOTE, "!cpu deep_idle_callb: invalid code %d\n",
696 		    code);
697 		break;
698 	}
699 	mutex_exit(&cpu_idle_callb_mutex);
700 	return (rslt);
701 }
702 
703 /*ARGSUSED*/
704 static boolean_t
705 cpu_idle_cpr_callb(void *arg, int code)
706 {
707 	boolean_t rslt = B_TRUE;
708 
709 	mutex_enter(&cpu_idle_callb_mutex);
710 	switch (code) {
711 	case CB_CODE_CPR_RESUME:
712 		if (hpet.callback(CB_CODE_CPR_RESUME)) {
713 			/*
714 			 * Do not enable dispatcher hooks if disabled by user.
715 			 */
716 			if (cpu_idle_cfg_state & CPU_IDLE_DEEP_CFG)
717 				break;
718 
719 			disp_enq_thread = cstate_wakeup;
720 			idle_cpu = cpu_idle_adaptive;
721 		} else {
722 			rslt = B_FALSE;
723 		}
724 		break;
725 
726 	case CB_CODE_CPR_CHKPT:
727 		idle_cpu = non_deep_idle_cpu;
728 		disp_enq_thread = non_deep_idle_disp_enq_thread;
729 		hpet.callback(CB_CODE_CPR_CHKPT);
730 		break;
731 
732 	default:
733 		cmn_err(CE_NOTE, "!cpudvr cpr_callb: invalid code %d\n", code);
734 		break;
735 	}
736 	mutex_exit(&cpu_idle_callb_mutex);
737 	return (rslt);
738 }
739 
740 /*
741  * handle _CST notification
742  */
743 void
744 cpuidle_cstate_instance(cpu_t *cp)
745 {
746 #ifndef	__xpv
747 	cpupm_mach_state_t	*mach_state =
748 	    (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
749 	cpu_acpi_handle_t	handle;
750 	struct machcpu		*mcpu;
751 	cpuset_t 		dom_cpu_set;
752 	kmutex_t		*pm_lock;
753 	int			result = 0;
754 	processorid_t		cpu_id;
755 
756 	if (mach_state == NULL) {
757 		return;
758 	}
759 
760 	ASSERT(mach_state->ms_cstate.cma_domain != NULL);
761 	dom_cpu_set = mach_state->ms_cstate.cma_domain->pm_cpus;
762 	pm_lock = &mach_state->ms_cstate.cma_domain->pm_lock;
763 
764 	/*
765 	 * Do for all the CPU's in the domain
766 	 */
767 	mutex_enter(pm_lock);
768 	do {
769 		CPUSET_FIND(dom_cpu_set, cpu_id);
770 		if (cpu_id == CPUSET_NOTINSET)
771 			break;
772 
773 		ASSERT(cpu_id >= 0 && cpu_id < NCPU);
774 		cp = cpu[cpu_id];
775 		mach_state = (cpupm_mach_state_t *)
776 		    cp->cpu_m.mcpu_pm_mach_state;
777 		if (!(mach_state->ms_caps & CPUPM_C_STATES)) {
778 			mutex_exit(pm_lock);
779 			return;
780 		}
781 		handle = mach_state->ms_acpi_handle;
782 		ASSERT(handle != NULL);
783 
784 		/*
785 		 * re-evaluate cstate object
786 		 */
787 		if (cpu_acpi_cache_cstate_data(handle) != 0) {
788 			cmn_err(CE_WARN, "Cannot re-evaluate the cpu c-state"
789 			    " object Instance: %d", cpu_id);
790 		}
791 		mutex_enter(&cpu_lock);
792 		mcpu = &(cp->cpu_m);
793 		mcpu->max_cstates = cpu_acpi_get_max_cstates(handle);
794 		if (mcpu->max_cstates > CPU_ACPI_C1) {
795 			hpet.callback(CST_EVENT_MULTIPLE_CSTATES);
796 			disp_enq_thread = cstate_wakeup;
797 			cp->cpu_m.mcpu_idle_cpu = cpu_acpi_idle;
798 			cpuidle_set_cstate_latency(cp);
799 		} else if (mcpu->max_cstates == CPU_ACPI_C1) {
800 			disp_enq_thread = non_deep_idle_disp_enq_thread;
801 			cp->cpu_m.mcpu_idle_cpu = non_deep_idle_cpu;
802 			hpet.callback(CST_EVENT_ONE_CSTATE);
803 		}
804 		mutex_exit(&cpu_lock);
805 
806 		CPUSET_ATOMIC_XDEL(dom_cpu_set, cpu_id, result);
807 		mutex_exit(pm_lock);
808 	} while (result < 0);
809 #endif
810 }
811 
812 /*
813  * handle the number or the type of available processor power states change
814  */
815 void
816 cpuidle_manage_cstates(void *ctx)
817 {
818 	cpu_t			*cp = ctx;
819 	processorid_t		cpu_id = cp->cpu_id;
820 	cpupm_mach_state_t	*mach_state =
821 	    (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
822 	boolean_t		is_ready;
823 
824 	if (mach_state == NULL) {
825 		return;
826 	}
827 
828 	/*
829 	 * We currently refuse to power manage if the CPU is not ready to
830 	 * take cross calls (cross calls fail silently if CPU is not ready
831 	 * for it).
832 	 *
833 	 * Additionally, for x86 platforms we cannot power manage
834 	 * any one instance, until all instances have been initialized.
835 	 * That's because we don't know what the CPU domains look like
836 	 * until all instances have been initialized.
837 	 */
838 	is_ready = CPUPM_XCALL_IS_READY(cpu_id) && cpupm_cstate_ready();
839 	if (!is_ready)
840 		return;
841 
842 	cpuidle_cstate_instance(cp);
843 }
844 
845 static void
846 cpuidle_set_cstate_latency(cpu_t *cp)
847 {
848 	cpupm_mach_state_t	*mach_state =
849 	    (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
850 	cpu_acpi_handle_t	handle;
851 	cpu_acpi_cstate_t	*acpi_cstates;
852 	cma_c_state_t		*cpupm_cdata;
853 	uint32_t		i, cnt;
854 
855 	cpupm_cdata = mach_state->ms_cstate.cma_state.cstate;
856 
857 	ASSERT(cpupm_cdata != 0);
858 	ASSERT(mach_state != NULL);
859 	handle = mach_state->ms_acpi_handle;
860 	ASSERT(handle != NULL);
861 
862 	cnt = CPU_ACPI_CSTATES_COUNT(handle);
863 	acpi_cstates = (cpu_acpi_cstate_t *)CPU_ACPI_CSTATES(handle);
864 
865 	cpupm_cdata->cs_C2_latency = CPU_CSTATE_LATENCY_UNDEF;
866 	cpupm_cdata->cs_C3_latency = CPU_CSTATE_LATENCY_UNDEF;
867 
868 	for (i = 1; i <= cnt; ++i, ++acpi_cstates) {
869 		if ((cpupm_cdata->cs_C2_latency == CPU_CSTATE_LATENCY_UNDEF) &&
870 		    (acpi_cstates->cs_type == CPU_ACPI_C2))
871 			cpupm_cdata->cs_C2_latency =  acpi_cstates->cs_latency;
872 
873 		if ((cpupm_cdata->cs_C3_latency == CPU_CSTATE_LATENCY_UNDEF) &&
874 		    (acpi_cstates->cs_type == CPU_ACPI_C3))
875 			cpupm_cdata->cs_C3_latency =  acpi_cstates->cs_latency;
876 	}
877 }
878