xref: /titanic_51/usr/src/uts/i86pc/os/cpupm/cpu_idle.c (revision 37714ae43602c675f9dc59b070bfdf9fa702872c)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 /*
26  * Copyright (c) 2009, Intel Corporation.
27  * All rights reserved.
28  */
29 
30 #include <sys/x86_archext.h>
31 #include <sys/machsystm.h>
32 #include <sys/x_call.h>
33 #include <sys/stat.h>
34 #include <sys/acpi/acpi.h>
35 #include <sys/acpica.h>
36 #include <sys/cpu_acpi.h>
37 #include <sys/cpu_idle.h>
38 #include <sys/cpupm.h>
39 #include <sys/cpu_event.h>
40 #include <sys/hpet.h>
41 #include <sys/archsystm.h>
42 #include <vm/hat_i86.h>
43 #include <sys/dtrace.h>
44 #include <sys/sdt.h>
45 #include <sys/callb.h>
46 
/*
 * Values for the "timer" argument of cstate_use_timer(): select the HPET
 * proxy before entering a deep C-state, or switch back to the local APIC
 * timer afterwards.
 */
47 #define	CSTATE_USING_HPET		1
48 #define	CSTATE_USING_LAT		2
49 
/* Routines defined outside this file. */
50 extern void cpu_idle_adaptive(void);
51 extern uint32_t cpupm_next_cstate(cma_c_state_t *cs_data,
52     cpu_acpi_cstate_t *cstates, uint32_t cs_count, hrtime_t start);
53 
/* Forward declarations for routines defined later in this file. */
54 static int cpu_idle_init(cpu_t *);
55 static void cpu_idle_fini(cpu_t *);
56 static boolean_t cpu_deep_idle_callb(void *arg, int code);
57 static boolean_t cpu_idle_cpr_callb(void *arg, int code);
58 static void acpi_cpu_cstate(cpu_acpi_cstate_t *cstate);
59 
60 static boolean_t cstate_use_timer(hrtime_t *lapic_expire, int timer);
61 
62 /*
63  * cpu_cstate_arat: the CPU has an always-running local APIC timer.
64  * cpu_cstate_hpet: the HPET proxy timer is used during deep C-states.
 * Both are set by cpu_deep_cstates_supported().
65  */
66 static boolean_t cpu_cstate_arat = B_FALSE;
67 static boolean_t cpu_cstate_hpet = B_FALSE;
68 
69 /*
70  * Interfaces for modules implementing Intel's deep c-state.
 * The initializer is positional: a descriptive name, the per-CPU init
 * routine, the per-CPU fini routine, and a final slot left NULL (its meaning
 * is defined by cpupm_state_ops_t, which is not visible in this file).
71  */
72 cpupm_state_ops_t cpu_idle_ops = {
73 	"Generic ACPI C-state Support",
74 	cpu_idle_init,
75 	cpu_idle_fini,
76 	NULL
77 };
78 
/*
 * cpu_idle_callb_mutex serializes registration/removal of the two callbacks
 * below and the updates the callbacks make to cpu_idle_cfg_state.
 * cpu_idle_cfg_state has CPU_IDLE_DEEP_CFG set while deep idle is disabled
 * (see cpu_deep_idle_callb()).
 */
79 static kmutex_t		cpu_idle_callb_mutex;
80 static callb_id_t	cpu_deep_idle_callb_id;
81 static callb_id_t	cpu_idle_cpr_callb_id;
82 static uint_t		cpu_idle_cfg_state;
83 
/* Installed as ks_lock on every c-state kstat created by cpu_idle_init(). */
84 static kmutex_t cpu_idle_mutex;
85 
/*
 * Single named-kstat template shared (as ks_data) by all c-state kstats;
 * cpu_idle_kstat_update() fills it in from the c-state being queried.
 */
86 cpu_idle_kstat_t cpu_idle_kstat = {
87 	{ "address_space_id",	KSTAT_DATA_STRING },
88 	{ "latency",		KSTAT_DATA_UINT32 },
89 	{ "power",		KSTAT_DATA_UINT32 },
90 };
91 
92 /*
93  * kstat update function of the c-state info
94  */
95 static int
96 cpu_idle_kstat_update(kstat_t *ksp, int flag)
97 {
98 	cpu_acpi_cstate_t *cstate = ksp->ks_private;
99 
100 	if (flag == KSTAT_WRITE) {
101 		return (EACCES);
102 	}
103 
104 	if (cstate->cs_addrspace_id == ACPI_ADR_SPACE_FIXED_HARDWARE) {
105 		kstat_named_setstr(&cpu_idle_kstat.addr_space_id,
106 		"FFixedHW");
107 	} else if (cstate->cs_addrspace_id == ACPI_ADR_SPACE_SYSTEM_IO) {
108 		kstat_named_setstr(&cpu_idle_kstat.addr_space_id,
109 		"SystemIO");
110 	} else {
111 		kstat_named_setstr(&cpu_idle_kstat.addr_space_id,
112 		"Unsupported");
113 	}
114 
115 	cpu_idle_kstat.cs_latency.value.ui32 = cstate->cs_latency;
116 	cpu_idle_kstat.cs_power.value.ui32 = cstate->cs_power;
117 
118 	return (0);
119 }
120 
121 /*
122  * Used during configuration callbacks to manage implementation specific
123  * details of the hardware timer used during Deep C-state.
124  */
125 boolean_t
126 cstate_timer_callback(int code)
127 {
128 	if (cpu_cstate_arat) {
129 		return (B_TRUE);
130 	} else if (cpu_cstate_hpet) {
131 		return (hpet.callback(code));
132 	}
133 	return (B_FALSE);
134 }
135 
136 /*
137  * Some Local APIC Timers do not work during Deep C-states.
138  * The Deep C-state idle function uses this function to ensure it is using a
139  * hardware timer that works during Deep C-states.  This function also
140  * switches the timer back to the LACPI Timer after Deep C-state.
141  */
142 static boolean_t
143 cstate_use_timer(hrtime_t *lapic_expire, int timer)
144 {
145 	if (cpu_cstate_arat)
146 		return (B_TRUE);
147 
148 	/*
149 	 * We have to return B_FALSE if no arat or hpet support
150 	 */
151 	if (!cpu_cstate_hpet)
152 		return (B_FALSE);
153 
154 	switch (timer) {
155 	case CSTATE_USING_HPET:
156 		return (hpet.use_hpet_timer(lapic_expire));
157 	case CSTATE_USING_LAT:
158 		hpet.use_lapic_timer(*lapic_expire);
159 		return (B_TRUE);
160 	default:
161 		return (B_FALSE);
162 	}
163 }
164 
165 /*
166  * c-state wakeup function.
167  * Similar to cpu_wakeup and cpu_wakeup_mwait except this function deals
168  * with CPUs asleep in MWAIT, HLT, or ACPI Deep C-State.
169  */
170 void
171 cstate_wakeup(cpu_t *cp, int bound)
172 {
173 	struct machcpu	*mcpu = &(cp->cpu_m);
174 	volatile uint32_t *mcpu_mwait = mcpu->mcpu_mwait;
175 	cpupart_t	*cpu_part;
176 	uint_t		cpu_found;
177 	processorid_t	cpu_sid;
178 
179 	cpu_part = cp->cpu_part;
180 	cpu_sid = cp->cpu_seqid;
181 	/*
182 	 * Clear the halted bit for that CPU since it will be woken up
183 	 * in a moment.
184 	 */
185 	if (bitset_in_set(&cpu_part->cp_haltset, cpu_sid)) {
186 		/*
187 		 * Clear the halted bit for that CPU since it will be
188 		 * poked in a moment.
189 		 */
190 		bitset_atomic_del(&cpu_part->cp_haltset, cpu_sid);
191 
192 		/*
193 		 * We may find the current CPU present in the halted cpuset
194 		 * if we're in the context of an interrupt that occurred
195 		 * before we had a chance to clear our bit in cpu_idle().
196 		 * Waking ourself is obviously unnecessary, since if
197 		 * we're here, we're not halted.
198 		 */
199 		if (cp != CPU) {
200 			/*
201 			 * Use correct wakeup mechanism
202 			 */
203 			if ((mcpu_mwait != NULL) &&
204 			    (*mcpu_mwait == MWAIT_HALTED))
205 				MWAIT_WAKEUP(cp);
206 			else
207 				poke_cpu(cp->cpu_id);
208 		}
209 		return;
210 	} else {
211 		/*
212 		 * This cpu isn't halted, but it's idle or undergoing a
213 		 * context switch. No need to awaken anyone else.
214 		 */
215 		if (cp->cpu_thread == cp->cpu_idle_thread ||
216 		    cp->cpu_disp_flags & CPU_DISP_DONTSTEAL)
217 			return;
218 	}
219 
220 	/*
221 	 * No need to wake up other CPUs if the thread we just enqueued
222 	 * is bound.
223 	 */
224 	if (bound)
225 		return;
226 
227 
228 	/*
229 	 * See if there's any other halted CPUs. If there are, then
230 	 * select one, and awaken it.
231 	 * It's possible that after we find a CPU, somebody else
232 	 * will awaken it before we get the chance.
233 	 * In that case, look again.
234 	 */
235 	do {
236 		cpu_found = bitset_find(&cpu_part->cp_haltset);
237 		if (cpu_found == (uint_t)-1)
238 			return;
239 
240 	} while (bitset_atomic_test_and_del(&cpu_part->cp_haltset,
241 	    cpu_found) < 0);
242 
243 	/*
244 	 * Must use correct wakeup mechanism to avoid lost wakeup of
245 	 * alternate cpu.
246 	 */
247 	if (cpu_found != CPU->cpu_seqid) {
248 		mcpu_mwait = cpu[cpu_found]->cpu_m.mcpu_mwait;
249 		if ((mcpu_mwait != NULL) && (*mcpu_mwait == MWAIT_HALTED))
250 			MWAIT_WAKEUP(cpu_seq[cpu_found]);
251 		else
252 			poke_cpu(cpu_seq[cpu_found]->cpu_id);
253 	}
254 }
255 
256 /*
257  * Function called by CPU idle notification framework to check whether CPU
258  * has been awakened. It will be called with interrupt disabled.
259  * If CPU has been awakened, call cpu_idle_exit() to notify CPU idle
260  * notification framework.
261  */
262 static void
263 acpi_cpu_mwait_check_wakeup(void *arg)
264 {
265 	volatile uint32_t *mcpu_mwait = (volatile uint32_t *)arg;
266 
267 	ASSERT(arg != NULL);
268 	if (*mcpu_mwait != MWAIT_HALTED) {
269 		/*
270 		 * CPU has been awakened, notify CPU idle notification system.
271 		 */
272 		cpu_idle_exit(CPU_IDLE_CB_FLAG_IDLE);
273 	} else {
274 		/*
275 		 * Toggle interrupt flag to detect pending interrupts.
276 		 * If interrupt happened, do_interrupt() will notify CPU idle
277 		 * notification framework so no need to call cpu_idle_exit()
278 		 * here.
279 		 */
280 		sti();
281 		SMT_PAUSE();
282 		cli();
283 	}
284 }
285 
286 static void
287 acpi_cpu_mwait_ipi_check_wakeup(void *arg)
288 {
289 	volatile uint32_t *mcpu_mwait = (volatile uint32_t *)arg;
290 
291 	ASSERT(arg != NULL);
292 	if (*mcpu_mwait != MWAIT_WAKEUP_IPI) {
293 		/*
294 		 * CPU has been awakened, notify CPU idle notification system.
295 		 */
296 		cpu_idle_exit(CPU_IDLE_CB_FLAG_IDLE);
297 	} else {
298 		/*
299 		 * Toggle interrupt flag to detect pending interrupts.
300 		 * If interrupt happened, do_interrupt() will notify CPU idle
301 		 * notification framework so no need to call cpu_idle_exit()
302 		 * here.
303 		 */
304 		sti();
305 		SMT_PAUSE();
306 		cli();
307 	}
308 }
309 
/*
 * Default wakeup check: there is no mwait word to examine, so just open a
 * short interrupt window.  If an interrupt is pending, do_interrupt() tells
 * the CPU idle notification framework, which is why cpu_idle_exit() is not
 * called from here.  arg is unused.
 */
/*ARGSUSED*/
static void
acpi_cpu_check_wakeup(void *arg)
{
	sti();
	SMT_PAUSE();
	cli();
}
323 
324 /*
325  * enter deep c-state handler
 *
 * Idles the current CPU in the ACPI C-state described by cstate:
 *  - publishes a wakeup value in the CPU's mwait word (when it has one),
 *  - adds the CPU to its partition's haltset unless it is offline or the
 *    only CPU,
 *  - re-checks for runnable work,
 *  - with interrupts disabled, asks cstate_use_timer() for a timer that
 *    keeps running in deep C-states and then enters the state, using
 *    monitor/mwait for FixedHW c-states or port reads for SystemIO ones.
 * When no such timer is available the CPU idles in C1 through mwait
 * instead.  Every path that disabled interrupts switches back to the LAPIC
 * timer and re-enables interrupts before returning, and the halted flag is
 * cleared again whenever it was set.
326  */
327 static void
328 acpi_cpu_cstate(cpu_acpi_cstate_t *cstate)
329 {
330 	volatile uint32_t	*mcpu_mwait = CPU->cpu_m.mcpu_mwait;
331 	cpu_t			*cpup = CPU;
332 	processorid_t		cpu_sid = cpup->cpu_seqid;
333 	cpupart_t		*cp = cpup->cpu_part;
334 	hrtime_t		lapic_expire;
335 	uint8_t			type = cstate->cs_addrspace_id;
336 	uint32_t		cs_type = cstate->cs_type;
337 	int			hset_update = 1;
338 	boolean_t		using_timer;
339 	cpu_idle_check_wakeup_t check_func = &acpi_cpu_check_wakeup;
340 
341 	/*
342 	 * Set our mcpu_mwait here, so we can tell if anyone tries to
343 	 * wake us between now and when we call mwait.  No other cpu will
344 	 * attempt to set our mcpu_mwait until we add ourself to the haltset.
	 *
	 * NOTE(review): this is the only place that tolerates a NULL
	 * mcpu_mwait; the paths below dereference it unconditionally, so
	 * callers presumably only get here when it is non-NULL - confirm.
345 	 */
346 	if (mcpu_mwait) {
347 		if (type == ACPI_ADR_SPACE_SYSTEM_IO) {
348 			*mcpu_mwait = MWAIT_WAKEUP_IPI;
349 			check_func = &acpi_cpu_mwait_ipi_check_wakeup;
350 		} else {
351 			*mcpu_mwait = MWAIT_HALTED;
352 			check_func = &acpi_cpu_mwait_check_wakeup;
353 		}
354 	}
355 
356 	/*
357 	 * If this CPU is online, and there are multiple CPUs
358 	 * in the system, then we should note our halting
359 	 * by adding ourselves to the partition's halted CPU
360 	 * bitmap. This allows other CPUs to find/awaken us when
361 	 * work becomes available.
362 	 */
363 	if (cpup->cpu_flags & CPU_OFFLINE || ncpus == 1)
364 		hset_update = 0;
365 
366 	/*
367 	 * Add ourselves to the partition's halted CPUs bitmask
368 	 * and set our HALTED flag, if necessary.
369 	 *
370 	 * When a thread becomes runnable, it is placed on the queue
371 	 * and then the halted cpuset is checked to determine who
372 	 * (if anyone) should be awakened. We therefore need to first
373 	 * add ourselves to the halted cpuset, and then check if there
374 	 * is any work available.
375 	 *
376 	 * Note that memory barriers after updating the HALTED flag
377 	 * are not necessary since an atomic operation (updating the bitmap)
378 	 * immediately follows. On x86 the atomic operation acts as a
379 	 * memory barrier for the update of cpu_disp_flags.
380 	 */
381 	if (hset_update) {
382 		cpup->cpu_disp_flags |= CPU_DISP_HALTED;
383 		bitset_atomic_add(&cp->cp_haltset, cpu_sid);
384 	}
385 
386 	/*
387 	 * Check to make sure there's really nothing to do.
388 	 * Work destined for this CPU may become available after
389 	 * this check. We'll be notified through the clearing of our
390 	 * bit in the halted CPU bitmask, and a write to our mcpu_mwait.
391 	 *
392 	 * disp_anywork() checks disp_nrunnable, so we do not have to later.
393 	 */
394 	if (disp_anywork()) {
395 		if (hset_update) {
396 			cpup->cpu_disp_flags &= ~CPU_DISP_HALTED;
397 			bitset_atomic_del(&cp->cp_haltset, cpu_sid);
398 		}
399 		return;
400 	}
401 
402 	/*
403 	 * We're on our way to being halted.
404 	 *
405 	 * The local APIC timer can stop in ACPI C2 and deeper c-states.
406 	 * Try to program the HPET hardware to substitute for this CPU's
407 	 * LAPIC timer.
408 	 * cstate_use_timer() could disable the LAPIC Timer.  Make sure
409 	 * to start the LAPIC Timer again before leaving this function.
410 	 *
411 	 * Disable interrupts here so we will awaken immediately after halting
412 	 * if someone tries to poke us between now and the time we actually
413 	 * halt.
414 	 */
415 	cli();
416 	using_timer = cstate_use_timer(&lapic_expire, CSTATE_USING_HPET);
417 
418 	/*
419 	 * We check for the presence of our bit after disabling interrupts.
420 	 * If it's cleared, we'll return. If the bit is cleared after
421 	 * we check then the cstate_wakeup() will pop us out of the halted
422 	 * state.
423 	 *
424 	 * This means that the ordering of the cstate_wakeup() and the clearing
425 	 * of the bit by cpu_wakeup is important.
426 	 * cpu_wakeup() must clear our cp_haltset bit, and then call
427 	 * cstate_wakeup().
428 	 * acpi_cpu_cstate() must disable interrupts, then check for the bit.
429 	 */
430 	if (hset_update && bitset_in_set(&cp->cp_haltset, cpu_sid) == 0) {
431 		(void) cstate_use_timer(&lapic_expire,
432 		    CSTATE_USING_LAT);
433 		sti();
		/* Our haltset bit is already gone; only the flag is left. */
434 		cpup->cpu_disp_flags &= ~CPU_DISP_HALTED;
435 		return;
436 	}
437 
438 	/*
439 	 * The check for anything locally runnable is here for performance
440 	 * and isn't needed for correctness. disp_nrunnable ought to be
441 	 * in our cache still, so it's inexpensive to check, and if there
442 	 * is anything runnable we won't have to wait for the poke.
443 	 */
444 	if (cpup->cpu_disp->disp_nrunnable != 0) {
445 		(void) cstate_use_timer(&lapic_expire,
446 		    CSTATE_USING_LAT);
447 		sti();
448 		if (hset_update) {
449 			cpup->cpu_disp_flags &= ~CPU_DISP_HALTED;
450 			bitset_atomic_del(&cp->cp_haltset, cpu_sid);
451 		}
452 		return;
453 	}
454 
455 	if (using_timer == B_FALSE) {
456 
457 		(void) cstate_use_timer(&lapic_expire,
458 		    CSTATE_USING_LAT);
459 		sti();
460 
461 		/*
462 		 * We are currently unable to program the HPET to act as this
463 		 * CPU's proxy LAPIC timer.  This CPU cannot enter C2 or deeper
464 		 * because no timer is set to wake it up while its LAPIC timer
465 		 * stalls in deep C-States.
466 		 * Enter C1 instead.
467 		 *
468 		 * cstate_wake_cpu() will wake this CPU with an IPI which
469 		 * works with MWAIT.
		 *
		 * The MWAIT_WAKEUP_IPI bits are masked off before comparing
		 * with MWAIT_HALTED - presumably so that either value stored
		 * at the top of this function counts as "not yet awakened";
		 * this depends on the macro values (TODO confirm).
470 		 */
471 		i86_monitor(mcpu_mwait, 0, 0);
472 		if ((*mcpu_mwait & ~MWAIT_WAKEUP_IPI) == MWAIT_HALTED) {
473 			if (cpu_idle_enter(IDLE_STATE_C1, 0,
474 			    check_func, (void *)mcpu_mwait) == 0) {
475 				if ((*mcpu_mwait & ~MWAIT_WAKEUP_IPI) ==
476 				    MWAIT_HALTED) {
477 					i86_mwait(0, 0);
478 				}
479 				cpu_idle_exit(CPU_IDLE_CB_FLAG_IDLE);
480 			}
481 		}
482 
483 		/*
484 		 * We're no longer halted
485 		 */
486 		if (hset_update) {
487 			cpup->cpu_disp_flags &= ~CPU_DISP_HALTED;
488 			bitset_atomic_del(&cp->cp_haltset, cpu_sid);
489 		}
490 		return;
491 	}
492 
493 	if (type == ACPI_ADR_SPACE_FIXED_HARDWARE) {
494 		/*
495 		 * We're on our way to being halted.
496 		 * To avoid a lost wakeup, arm the monitor before checking
497 		 * if another cpu wrote to mcpu_mwait to wake us up.
498 		 */
499 		i86_monitor(mcpu_mwait, 0, 0);
500 		if (*mcpu_mwait == MWAIT_HALTED) {
501 			if (cpu_idle_enter((uint_t)cs_type, 0,
502 			    check_func, (void *)mcpu_mwait) == 0) {
				/*
				 * Re-check after the idle-enter callbacks
				 * ran; cs_address is passed to mwait,
				 * presumably as the C-state hint (confirm).
				 */
503 				if (*mcpu_mwait == MWAIT_HALTED) {
504 					i86_mwait(cstate->cs_address, 1);
505 				}
506 				cpu_idle_exit(CPU_IDLE_CB_FLAG_IDLE);
507 			}
508 		}
509 	} else if (type == ACPI_ADR_SPACE_SYSTEM_IO) {
510 		uint32_t value;
511 		ACPI_TABLE_FADT *gbl_FADT;
512 
513 		if (*mcpu_mwait == MWAIT_WAKEUP_IPI) {
514 			if (cpu_idle_enter((uint_t)cs_type, 0,
515 			    check_func, (void *)mcpu_mwait) == 0) {
				/*
				 * Presumably the 8-bit read of the c-state's
				 * port requests the transition and the PM
				 * timer read that follows is a dummy read;
				 * both results are discarded (confirm
				 * against the ACPI specification).
				 */
516 				if (*mcpu_mwait == MWAIT_WAKEUP_IPI) {
517 					(void) cpu_acpi_read_port(
518 					    cstate->cs_address, &value, 8);
519 					acpica_get_global_FADT(&gbl_FADT);
520 					(void) cpu_acpi_read_port(
521 					    gbl_FADT->XPmTimerBlock.Address,
522 					    &value, 32);
523 				}
524 				cpu_idle_exit(CPU_IDLE_CB_FLAG_IDLE);
525 			}
526 		}
527 	}
528 
529 	/*
530 	 * The LAPIC timer may have stopped in deep c-state.
531 	 * Reprogram this CPU's LAPIC here before enabling interrupts.
532 	 */
533 	(void) cstate_use_timer(&lapic_expire, CSTATE_USING_LAT);
534 	sti();
535 
536 	/*
537 	 * We're no longer halted
538 	 */
539 	if (hset_update) {
540 		cpup->cpu_disp_flags &= ~CPU_DISP_HALTED;
541 		bitset_atomic_del(&cp->cp_haltset, cpu_sid);
542 	}
543 }
544 
545 /*
546  * indicate when bus masters are active
547  */
548 static uint32_t
549 cpu_acpi_bm_sts(void)
550 {
551 	uint32_t bm_sts = 0;
552 
553 	cpu_acpi_get_register(ACPI_BITREG_BUS_MASTER_STATUS, &bm_sts);
554 
555 	if (bm_sts)
556 		cpu_acpi_set_register(ACPI_BITREG_BUS_MASTER_STATUS, 1);
557 
558 	return (bm_sts);
559 }
560 
561 /*
562  * Idle the present CPU, deep c-state is supported
563  */
564 void
565 cpu_acpi_idle(void)
566 {
567 	cpu_t *cp = CPU;
568 	cpu_acpi_handle_t handle;
569 	cma_c_state_t *cs_data;
570 	cpu_acpi_cstate_t *cstates;
571 	hrtime_t start, end;
572 	int cpu_max_cstates;
573 	uint32_t cs_indx;
574 	uint16_t cs_type;
575 
576 	cpupm_mach_state_t *mach_state =
577 	    (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
578 	handle = mach_state->ms_acpi_handle;
579 	ASSERT(CPU_ACPI_CSTATES(handle) != NULL);
580 
581 	cs_data = mach_state->ms_cstate.cma_state.cstate;
582 	cstates = (cpu_acpi_cstate_t *)CPU_ACPI_CSTATES(handle);
583 	ASSERT(cstates != NULL);
584 	cpu_max_cstates = cpu_acpi_get_max_cstates(handle);
585 	if (cpu_max_cstates > CPU_MAX_CSTATES)
586 		cpu_max_cstates = CPU_MAX_CSTATES;
587 	if (cpu_max_cstates == 1) {	/* no ACPI c-state data */
588 		(*non_deep_idle_cpu)();
589 		return;
590 	}
591 
592 	start = gethrtime_unscaled();
593 
594 	cs_indx = cpupm_next_cstate(cs_data, cstates, cpu_max_cstates, start);
595 
596 	/*
597 	 * OSPM uses the BM_STS bit to determine the power state to enter
598 	 * when considering a transition to or from the C2/C3 power state.
599 	 * if C3 is determined, bus master activity demotes the power state
600 	 * to C2.
601 	 */
602 	if ((cstates[cs_indx].cs_type >= CPU_ACPI_C3) && cpu_acpi_bm_sts())
603 		--cs_indx;
604 	cs_type = cstates[cs_indx].cs_type;
605 
606 	/*
607 	 * BM_RLD determines if the Cx power state was exited as a result of
608 	 * bus master requests. Set this bit when using a C3 power state, and
609 	 * clear it when using a C1 or C2 power state.
610 	 */
611 	if ((CPU_ACPI_BM_INFO(handle) & BM_RLD) && (cs_type < CPU_ACPI_C3)) {
612 		cpu_acpi_set_register(ACPI_BITREG_BUS_MASTER_RLD, 0);
613 		CPU_ACPI_BM_INFO(handle) &= ~BM_RLD;
614 	}
615 
616 	if ((!(CPU_ACPI_BM_INFO(handle) & BM_RLD)) &&
617 	    (cs_type >= CPU_ACPI_C3)) {
618 		cpu_acpi_set_register(ACPI_BITREG_BUS_MASTER_RLD, 1);
619 		CPU_ACPI_BM_INFO(handle) |= BM_RLD;
620 	}
621 
622 	switch (cs_type) {
623 	default:
624 		/* FALLTHROUGH */
625 	case CPU_ACPI_C1:
626 		(*non_deep_idle_cpu)();
627 		break;
628 
629 	case CPU_ACPI_C2:
630 		acpi_cpu_cstate(&cstates[cs_indx]);
631 		break;
632 
633 	case CPU_ACPI_C3:
634 		/*
635 		 * recommended in ACPI spec, providing hardware mechanisms
636 		 * to prevent master from writing to memory (UP-only)
637 		 */
638 		if ((ncpus_online == 1) &&
639 		    (CPU_ACPI_BM_INFO(handle) & BM_CTL)) {
640 			cpu_acpi_set_register(ACPI_BITREG_ARB_DISABLE, 1);
641 			CPU_ACPI_BM_INFO(handle) |= BM_ARB_DIS;
642 		/*
643 		 * Today all Intel's processor support C3 share cache.
644 		 */
645 		} else if (x86_vendor != X86_VENDOR_Intel) {
646 			__acpi_wbinvd();
647 		}
648 		acpi_cpu_cstate(&cstates[cs_indx]);
649 		if (CPU_ACPI_BM_INFO(handle) & BM_ARB_DIS) {
650 			cpu_acpi_set_register(ACPI_BITREG_ARB_DISABLE, 0);
651 			CPU_ACPI_BM_INFO(handle) &= ~BM_ARB_DIS;
652 		}
653 		break;
654 	}
655 
656 	end = gethrtime_unscaled();
657 
658 	/*
659 	 * Update statistics
660 	 */
661 	cpupm_wakeup_cstate_data(cs_data, end);
662 }
663 
664 boolean_t
665 cpu_deep_cstates_supported(void)
666 {
667 	extern int	idle_cpu_no_deep_c;
668 
669 	if (idle_cpu_no_deep_c)
670 		return (B_FALSE);
671 
672 	if (!cpuid_deep_cstates_supported())
673 		return (B_FALSE);
674 
675 	if (cpuid_arat_supported()) {
676 		cpu_cstate_arat = B_TRUE;
677 		return (B_TRUE);
678 	}
679 
680 	if ((hpet.supported == HPET_FULL_SUPPORT) &&
681 	    hpet.install_proxy()) {
682 		cpu_cstate_hpet = B_TRUE;
683 		return (B_TRUE);
684 	}
685 
686 	return (B_FALSE);
687 }
688 
689 /*
690  * Validate that this processor supports deep cstate and if so,
691  * get the c-state data from ACPI and cache it.
692  */
693 static int
694 cpu_idle_init(cpu_t *cp)
695 {
696 	cpupm_mach_state_t *mach_state =
697 	    (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
698 	cpu_acpi_handle_t handle = mach_state->ms_acpi_handle;
699 	cpu_acpi_cstate_t *cstate;
700 	char name[KSTAT_STRLEN];
701 	int cpu_max_cstates, i;
702 	ACPI_TABLE_FADT *gbl_FADT;
703 	int ret;
704 
705 	/*
706 	 * Cache the C-state specific ACPI data.
707 	 */
708 	if ((ret = cpu_acpi_cache_cstate_data(handle)) != 0) {
709 		if (ret < 0)
710 			cmn_err(CE_NOTE,
711 			    "!Support for CPU deep idle states is being "
712 			    "disabled due to errors parsing ACPI C-state "
713 			    "objects exported by BIOS.");
714 		cpu_idle_fini(cp);
715 		return (-1);
716 	}
717 
718 	/*
719 	 * Check the bus master arbitration control ability.
720 	 */
721 	acpica_get_global_FADT(&gbl_FADT);
722 	if (gbl_FADT->Pm2ControlBlock && gbl_FADT->Pm2ControlLength)
723 		CPU_ACPI_BM_INFO(handle) |= BM_CTL;
724 
725 	cstate = (cpu_acpi_cstate_t *)CPU_ACPI_CSTATES(handle);
726 
727 	cpu_max_cstates = cpu_acpi_get_max_cstates(handle);
728 
729 	for (i = CPU_ACPI_C1; i <= cpu_max_cstates; i++) {
730 		(void) snprintf(name, KSTAT_STRLEN - 1, "c%d", cstate->cs_type);
731 		/*
732 		 * Allocate, initialize and install cstate kstat
733 		 */
734 		cstate->cs_ksp = kstat_create("cstate", CPU->cpu_id,
735 		    name, "misc",
736 		    KSTAT_TYPE_NAMED,
737 		    sizeof (cpu_idle_kstat) / sizeof (kstat_named_t),
738 		    KSTAT_FLAG_VIRTUAL);
739 
740 		if (cstate->cs_ksp == NULL) {
741 			cmn_err(CE_NOTE, "kstat_create(c_state) fail");
742 		} else {
743 			cstate->cs_ksp->ks_data = &cpu_idle_kstat;
744 			cstate->cs_ksp->ks_lock = &cpu_idle_mutex;
745 			cstate->cs_ksp->ks_update = cpu_idle_kstat_update;
746 			cstate->cs_ksp->ks_data_size += MAXNAMELEN;
747 			cstate->cs_ksp->ks_private = cstate;
748 			kstat_install(cstate->cs_ksp);
749 			cstate++;
750 		}
751 	}
752 
753 	cpupm_alloc_domains(cp, CPUPM_C_STATES);
754 	cpupm_alloc_ms_cstate(cp);
755 
756 	if (cpu_deep_cstates_supported()) {
757 		mutex_enter(&cpu_idle_callb_mutex);
758 		if (cpu_deep_idle_callb_id == (callb_id_t)0)
759 			cpu_deep_idle_callb_id = callb_add(&cpu_deep_idle_callb,
760 			    (void *)NULL, CB_CL_CPU_DEEP_IDLE, "cpu_deep_idle");
761 		if (cpu_idle_cpr_callb_id == (callb_id_t)0)
762 			cpu_idle_cpr_callb_id = callb_add(&cpu_idle_cpr_callb,
763 			    (void *)NULL, CB_CL_CPR_PM, "cpu_idle_cpr");
764 		mutex_exit(&cpu_idle_callb_mutex);
765 	}
766 
767 	return (0);
768 }
769 
770 /*
771  * Free resources allocated by cpu_idle_init().
772  */
773 static void
774 cpu_idle_fini(cpu_t *cp)
775 {
776 	cpupm_mach_state_t *mach_state =
777 	    (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
778 	cpu_acpi_handle_t handle = mach_state->ms_acpi_handle;
779 	cpu_acpi_cstate_t *cstate;
780 	uint_t	cpu_max_cstates, i;
781 
782 	/*
783 	 * idle cpu points back to the generic one
784 	 */
785 	idle_cpu = CPU->cpu_m.mcpu_idle_cpu = non_deep_idle_cpu;
786 	disp_enq_thread = non_deep_idle_disp_enq_thread;
787 
788 	cstate = (cpu_acpi_cstate_t *)CPU_ACPI_CSTATES(handle);
789 	if (cstate) {
790 		cpu_max_cstates = cpu_acpi_get_max_cstates(handle);
791 
792 		for (i = CPU_ACPI_C1; i <= cpu_max_cstates; i++) {
793 			if (cstate->cs_ksp != NULL)
794 				kstat_delete(cstate->cs_ksp);
795 			cstate++;
796 		}
797 	}
798 
799 	cpupm_free_ms_cstate(cp);
800 	cpupm_free_domains(&cpupm_cstate_domains);
801 	cpu_acpi_free_cstate_data(handle);
802 
803 	mutex_enter(&cpu_idle_callb_mutex);
804 	if (cpu_deep_idle_callb_id != (callb_id_t)0) {
805 		(void) callb_delete(cpu_deep_idle_callb_id);
806 		cpu_deep_idle_callb_id = (callb_id_t)0;
807 	}
808 	if (cpu_idle_cpr_callb_id != (callb_id_t)0) {
809 		(void) callb_delete(cpu_idle_cpr_callb_id);
810 		cpu_idle_cpr_callb_id = (callb_id_t)0;
811 	}
812 	mutex_exit(&cpu_idle_callb_mutex);
813 }
814 
815 /*ARGSUSED*/
816 static boolean_t
817 cpu_deep_idle_callb(void *arg, int code)
818 {
819 	boolean_t rslt = B_TRUE;
820 
821 	mutex_enter(&cpu_idle_callb_mutex);
822 	switch (code) {
823 	case PM_DEFAULT_CPU_DEEP_IDLE:
824 		/*
825 		 * Default policy is same as enable
826 		 */
827 		/*FALLTHROUGH*/
828 	case PM_ENABLE_CPU_DEEP_IDLE:
829 		if ((cpu_idle_cfg_state & CPU_IDLE_DEEP_CFG) == 0)
830 			break;
831 
832 		if (cstate_timer_callback(PM_ENABLE_CPU_DEEP_IDLE)) {
833 			disp_enq_thread = cstate_wakeup;
834 			idle_cpu = cpu_idle_adaptive;
835 			cpu_idle_cfg_state &= ~CPU_IDLE_DEEP_CFG;
836 		} else {
837 			rslt = B_FALSE;
838 		}
839 		break;
840 
841 	case PM_DISABLE_CPU_DEEP_IDLE:
842 		if (cpu_idle_cfg_state & CPU_IDLE_DEEP_CFG)
843 			break;
844 
845 		idle_cpu = non_deep_idle_cpu;
846 		if (cstate_timer_callback(PM_DISABLE_CPU_DEEP_IDLE)) {
847 			disp_enq_thread = non_deep_idle_disp_enq_thread;
848 			cpu_idle_cfg_state |= CPU_IDLE_DEEP_CFG;
849 		}
850 		break;
851 
852 	default:
853 		cmn_err(CE_NOTE, "!cpu deep_idle_callb: invalid code %d\n",
854 		    code);
855 		break;
856 	}
857 	mutex_exit(&cpu_idle_callb_mutex);
858 	return (rslt);
859 }
860 
861 /*ARGSUSED*/
862 static boolean_t
863 cpu_idle_cpr_callb(void *arg, int code)
864 {
865 	boolean_t rslt = B_TRUE;
866 
867 	mutex_enter(&cpu_idle_callb_mutex);
868 	switch (code) {
869 	case CB_CODE_CPR_RESUME:
870 		if (cstate_timer_callback(CB_CODE_CPR_RESUME)) {
871 			/*
872 			 * Do not enable dispatcher hooks if disabled by user.
873 			 */
874 			if (cpu_idle_cfg_state & CPU_IDLE_DEEP_CFG)
875 				break;
876 
877 			disp_enq_thread = cstate_wakeup;
878 			idle_cpu = cpu_idle_adaptive;
879 		} else {
880 			rslt = B_FALSE;
881 		}
882 		break;
883 
884 	case CB_CODE_CPR_CHKPT:
885 		idle_cpu = non_deep_idle_cpu;
886 		disp_enq_thread = non_deep_idle_disp_enq_thread;
887 		(void) cstate_timer_callback(CB_CODE_CPR_CHKPT);
888 		break;
889 
890 	default:
891 		cmn_err(CE_NOTE, "!cpudvr cpr_callb: invalid code %d\n", code);
892 		break;
893 	}
894 	mutex_exit(&cpu_idle_callb_mutex);
895 	return (rslt);
896 }
897 
898 /*
899  * handle _CST notification
900  */
901 void
902 cpuidle_cstate_instance(cpu_t *cp)
903 {
904 #ifndef	__xpv
905 	cpupm_mach_state_t	*mach_state =
906 	    (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
907 	cpu_acpi_handle_t	handle;
908 	struct machcpu		*mcpu;
909 	cpuset_t 		dom_cpu_set;
910 	kmutex_t		*pm_lock;
911 	int			result = 0;
912 	processorid_t		cpu_id;
913 
914 	if (mach_state == NULL) {
915 		return;
916 	}
917 
918 	ASSERT(mach_state->ms_cstate.cma_domain != NULL);
919 	dom_cpu_set = mach_state->ms_cstate.cma_domain->pm_cpus;
920 	pm_lock = &mach_state->ms_cstate.cma_domain->pm_lock;
921 
922 	/*
923 	 * Do for all the CPU's in the domain
924 	 */
925 	mutex_enter(pm_lock);
926 	do {
927 		CPUSET_FIND(dom_cpu_set, cpu_id);
928 		if (cpu_id == CPUSET_NOTINSET)
929 			break;
930 
931 		ASSERT(cpu_id >= 0 && cpu_id < NCPU);
932 		cp = cpu[cpu_id];
933 		mach_state = (cpupm_mach_state_t *)
934 		    cp->cpu_m.mcpu_pm_mach_state;
935 		if (!(mach_state->ms_caps & CPUPM_C_STATES)) {
936 			mutex_exit(pm_lock);
937 			return;
938 		}
939 		handle = mach_state->ms_acpi_handle;
940 		ASSERT(handle != NULL);
941 
942 		/*
943 		 * re-evaluate cstate object
944 		 */
945 		if (cpu_acpi_cache_cstate_data(handle) != 0) {
946 			cmn_err(CE_WARN, "Cannot re-evaluate the cpu c-state"
947 			    " object Instance: %d", cpu_id);
948 		}
949 		mutex_enter(&cpu_lock);
950 		mcpu = &(cp->cpu_m);
951 		mcpu->max_cstates = cpu_acpi_get_max_cstates(handle);
952 		if (mcpu->max_cstates > CPU_ACPI_C1) {
953 			(void) cstate_timer_callback(
954 			    CST_EVENT_MULTIPLE_CSTATES);
955 			disp_enq_thread = cstate_wakeup;
956 			cp->cpu_m.mcpu_idle_cpu = cpu_acpi_idle;
957 		} else if (mcpu->max_cstates == CPU_ACPI_C1) {
958 			disp_enq_thread = non_deep_idle_disp_enq_thread;
959 			cp->cpu_m.mcpu_idle_cpu = non_deep_idle_cpu;
960 			(void) cstate_timer_callback(CST_EVENT_ONE_CSTATE);
961 		}
962 		mutex_exit(&cpu_lock);
963 
964 		CPUSET_ATOMIC_XDEL(dom_cpu_set, cpu_id, result);
965 		mutex_exit(pm_lock);
966 	} while (result < 0);
967 #endif
968 }
969 
970 /*
971  * handle the number or the type of available processor power states change
972  */
973 void
974 cpuidle_manage_cstates(void *ctx)
975 {
976 	cpu_t			*cp = ctx;
977 	cpupm_mach_state_t	*mach_state =
978 	    (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
979 	boolean_t		is_ready;
980 
981 	if (mach_state == NULL) {
982 		return;
983 	}
984 
985 	/*
986 	 * We currently refuse to power manage if the CPU is not ready to
987 	 * take cross calls (cross calls fail silently if CPU is not ready
988 	 * for it).
989 	 *
990 	 * Additionally, for x86 platforms we cannot power manage
991 	 * any one instance, until all instances have been initialized.
992 	 * That's because we don't know what the CPU domains look like
993 	 * until all instances have been initialized.
994 	 */
995 	is_ready = (cp->cpu_flags & CPU_READY) && cpupm_cstate_ready();
996 	if (!is_ready)
997 		return;
998 
999 	cpuidle_cstate_instance(cp);
1000 }
1001