xref: /illumos-gate/usr/src/uts/i86pc/os/cpupm/cpu_idle.c (revision 6e6545bfaed3bab9ce836ee82d1abd8f2edba89a)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 /*
26  * Copyright (c) 2009-2010, Intel Corporation.
27  * All rights reserved.
28  */
29 /*
30  * Copyright 2019 Joyent, Inc.
31  */
32 
33 #include <sys/x86_archext.h>
34 #include <sys/machsystm.h>
35 #include <sys/x_call.h>
36 #include <sys/stat.h>
37 #include <sys/acpi/acpi.h>
38 #include <sys/acpica.h>
39 #include <sys/cpu_acpi.h>
40 #include <sys/cpu_idle.h>
41 #include <sys/cpupm.h>
42 #include <sys/cpu_event.h>
43 #include <sys/hpet.h>
44 #include <sys/archsystm.h>
45 #include <vm/hat_i86.h>
46 #include <sys/dtrace.h>
47 #include <sys/sdt.h>
48 #include <sys/callb.h>
49 
50 #define	CSTATE_USING_HPET		1
51 #define	CSTATE_USING_LAT		2
52 
53 #define	CPU_IDLE_STOP_TIMEOUT		1000
54 
55 extern void cpu_idle_adaptive(void);
56 extern uint32_t cpupm_next_cstate(cma_c_state_t *cs_data,
57     cpu_acpi_cstate_t *cstates, uint32_t cs_count, hrtime_t start);
58 
59 static int cpu_idle_init(cpu_t *);
60 static void cpu_idle_fini(cpu_t *);
61 static void cpu_idle_stop(cpu_t *);
62 static boolean_t cpu_deep_idle_callb(void *arg, int code);
63 static boolean_t cpu_idle_cpr_callb(void *arg, int code);
64 static void acpi_cpu_cstate(cpu_acpi_cstate_t *cstate);
65 
66 static boolean_t cstate_use_timer(hrtime_t *lapic_expire, int timer);
67 
68 /*
69  * the flag of always-running local APIC timer.
70  * the flag of HPET Timer use in deep cstate.
71  */
72 static boolean_t cpu_cstate_arat = B_FALSE;
73 static boolean_t cpu_cstate_hpet = B_FALSE;
74 
75 /*
76  * Interfaces for modules implementing Intel's deep c-state.
77  */
78 cpupm_state_ops_t cpu_idle_ops = {
79 	"Generic ACPI C-state Support",
80 	cpu_idle_init,
81 	cpu_idle_fini,
82 	NULL,
83 	cpu_idle_stop
84 };
85 
86 static kmutex_t		cpu_idle_callb_mutex;
87 static callb_id_t	cpu_deep_idle_callb_id;
88 static callb_id_t	cpu_idle_cpr_callb_id;
89 static uint_t		cpu_idle_cfg_state;
90 
91 static kmutex_t cpu_idle_mutex;
92 
93 cpu_idle_kstat_t cpu_idle_kstat = {
94 	{ "address_space_id",	KSTAT_DATA_STRING },
95 	{ "latency",		KSTAT_DATA_UINT32 },
96 	{ "power",		KSTAT_DATA_UINT32 },
97 };
98 
99 /*
100  * kstat update function of the c-state info
101  */
102 static int
103 cpu_idle_kstat_update(kstat_t *ksp, int flag)
104 {
105 	cpu_acpi_cstate_t *cstate = ksp->ks_private;
106 
107 	if (flag == KSTAT_WRITE) {
108 		return (EACCES);
109 	}
110 
111 	if (cstate->cs_addrspace_id == ACPI_ADR_SPACE_FIXED_HARDWARE) {
112 		kstat_named_setstr(&cpu_idle_kstat.addr_space_id,
113 		"FFixedHW");
114 	} else if (cstate->cs_addrspace_id == ACPI_ADR_SPACE_SYSTEM_IO) {
115 		kstat_named_setstr(&cpu_idle_kstat.addr_space_id,
116 		"SystemIO");
117 	} else {
118 		kstat_named_setstr(&cpu_idle_kstat.addr_space_id,
119 		"Unsupported");
120 	}
121 
122 	cpu_idle_kstat.cs_latency.value.ui32 = cstate->cs_latency;
123 	cpu_idle_kstat.cs_power.value.ui32 = cstate->cs_power;
124 
125 	return (0);
126 }
127 
128 /*
129  * Used during configuration callbacks to manage implementation specific
130  * details of the hardware timer used during Deep C-state.
131  */
132 boolean_t
133 cstate_timer_callback(int code)
134 {
135 	if (cpu_cstate_arat) {
136 		return (B_TRUE);
137 	} else if (cpu_cstate_hpet) {
138 		return (hpet.callback(code));
139 	}
140 	return (B_FALSE);
141 }
142 
143 /*
144  * Some Local APIC Timers do not work during Deep C-states.
145  * The Deep C-state idle function uses this function to ensure it is using a
146  * hardware timer that works during Deep C-states.  This function also
147  * switches the timer back to the LACPI Timer after Deep C-state.
148  */
149 static boolean_t
150 cstate_use_timer(hrtime_t *lapic_expire, int timer)
151 {
152 	if (cpu_cstate_arat)
153 		return (B_TRUE);
154 
155 	/*
156 	 * We have to return B_FALSE if no arat or hpet support
157 	 */
158 	if (!cpu_cstate_hpet)
159 		return (B_FALSE);
160 
161 	switch (timer) {
162 	case CSTATE_USING_HPET:
163 		return (hpet.use_hpet_timer(lapic_expire));
164 	case CSTATE_USING_LAT:
165 		hpet.use_lapic_timer(*lapic_expire);
166 		return (B_TRUE);
167 	default:
168 		return (B_FALSE);
169 	}
170 }
171 
172 /*
173  * c-state wakeup function.
174  * Similar to cpu_wakeup and cpu_wakeup_mwait except this function deals
175  * with CPUs asleep in MWAIT, HLT, or ACPI Deep C-State.
176  */
177 void
178 cstate_wakeup(cpu_t *cp, int bound)
179 {
180 	struct machcpu	*mcpu = &(cp->cpu_m);
181 	volatile uint32_t *mcpu_mwait = mcpu->mcpu_mwait;
182 	cpupart_t	*cpu_part;
183 	uint_t		cpu_found;
184 	processorid_t	cpu_sid;
185 
186 	cpu_part = cp->cpu_part;
187 	cpu_sid = cp->cpu_seqid;
188 	/*
189 	 * Clear the halted bit for that CPU since it will be woken up
190 	 * in a moment.
191 	 */
192 	if (bitset_in_set(&cpu_part->cp_haltset, cpu_sid)) {
193 		/*
194 		 * Clear the halted bit for that CPU since it will be
195 		 * poked in a moment.
196 		 */
197 		bitset_atomic_del(&cpu_part->cp_haltset, cpu_sid);
198 
199 		/*
200 		 * We may find the current CPU present in the halted cpuset
201 		 * if we're in the context of an interrupt that occurred
202 		 * before we had a chance to clear our bit in cpu_idle().
203 		 * Waking ourself is obviously unnecessary, since if
204 		 * we're here, we're not halted.
205 		 */
206 		if (cp != CPU) {
207 			/*
208 			 * Use correct wakeup mechanism
209 			 */
210 			if ((mcpu_mwait != NULL) &&
211 			    (*mcpu_mwait == MWAIT_HALTED))
212 				MWAIT_WAKEUP(cp);
213 			else
214 				poke_cpu(cp->cpu_id);
215 		}
216 		return;
217 	} else {
218 		/*
219 		 * This cpu isn't halted, but it's idle or undergoing a
220 		 * context switch. No need to awaken anyone else.
221 		 */
222 		if (cp->cpu_thread == cp->cpu_idle_thread ||
223 		    cp->cpu_disp_flags & CPU_DISP_DONTSTEAL)
224 			return;
225 	}
226 
227 	/*
228 	 * No need to wake up other CPUs if the thread we just enqueued
229 	 * is bound.
230 	 */
231 	if (bound)
232 		return;
233 
234 
235 	/*
236 	 * See if there's any other halted CPUs. If there are, then
237 	 * select one, and awaken it.
238 	 * It's possible that after we find a CPU, somebody else
239 	 * will awaken it before we get the chance.
240 	 * In that case, look again.
241 	 */
242 	do {
243 		cpu_found = bitset_find(&cpu_part->cp_haltset);
244 		if (cpu_found == (uint_t)-1)
245 			return;
246 
247 	} while (bitset_atomic_test_and_del(&cpu_part->cp_haltset,
248 	    cpu_found) < 0);
249 
250 	/*
251 	 * Must use correct wakeup mechanism to avoid lost wakeup of
252 	 * alternate cpu.
253 	 */
254 	if (cpu_found != CPU->cpu_seqid) {
255 		mcpu_mwait = cpu_seq[cpu_found]->cpu_m.mcpu_mwait;
256 		if ((mcpu_mwait != NULL) && (*mcpu_mwait == MWAIT_HALTED))
257 			MWAIT_WAKEUP(cpu_seq[cpu_found]);
258 		else
259 			poke_cpu(cpu_seq[cpu_found]->cpu_id);
260 	}
261 }
262 
263 /*
264  * Function called by CPU idle notification framework to check whether CPU
265  * has been awakened. It will be called with interrupt disabled.
266  * If CPU has been awakened, call cpu_idle_exit() to notify CPU idle
267  * notification framework.
268  */
269 static void
270 acpi_cpu_mwait_check_wakeup(void *arg)
271 {
272 	volatile uint32_t *mcpu_mwait = (volatile uint32_t *)arg;
273 
274 	ASSERT(arg != NULL);
275 	if (*mcpu_mwait != MWAIT_HALTED) {
276 		/*
277 		 * CPU has been awakened, notify CPU idle notification system.
278 		 */
279 		cpu_idle_exit(CPU_IDLE_CB_FLAG_IDLE);
280 	} else {
281 		/*
282 		 * Toggle interrupt flag to detect pending interrupts.
283 		 * If interrupt happened, do_interrupt() will notify CPU idle
284 		 * notification framework so no need to call cpu_idle_exit()
285 		 * here.
286 		 */
287 		sti();
288 		SMT_PAUSE();
289 		cli();
290 	}
291 }
292 
293 static void
294 acpi_cpu_mwait_ipi_check_wakeup(void *arg)
295 {
296 	volatile uint32_t *mcpu_mwait = (volatile uint32_t *)arg;
297 
298 	ASSERT(arg != NULL);
299 	if (*mcpu_mwait != MWAIT_WAKEUP_IPI) {
300 		/*
301 		 * CPU has been awakened, notify CPU idle notification system.
302 		 */
303 		cpu_idle_exit(CPU_IDLE_CB_FLAG_IDLE);
304 	} else {
305 		/*
306 		 * Toggle interrupt flag to detect pending interrupts.
307 		 * If interrupt happened, do_interrupt() will notify CPU idle
308 		 * notification framework so no need to call cpu_idle_exit()
309 		 * here.
310 		 */
311 		sti();
312 		SMT_PAUSE();
313 		cli();
314 	}
315 }
316 
317 /*ARGSUSED*/
318 static void
319 acpi_cpu_check_wakeup(void *arg)
320 {
321 	/*
322 	 * Toggle interrupt flag to detect pending interrupts.
323 	 * If interrupt happened, do_interrupt() will notify CPU idle
324 	 * notification framework so no need to call cpu_idle_exit() here.
325 	 */
326 	sti();
327 	SMT_PAUSE();
328 	cli();
329 }
330 
331 /*
332  * enter deep c-state handler
333  */
334 static void
335 acpi_cpu_cstate(cpu_acpi_cstate_t *cstate)
336 {
337 	volatile uint32_t	*mcpu_mwait = CPU->cpu_m.mcpu_mwait;
338 	cpu_t			*cpup = CPU;
339 	processorid_t		cpu_sid = cpup->cpu_seqid;
340 	cpupart_t		*cp = cpup->cpu_part;
341 	hrtime_t		lapic_expire;
342 	uint8_t			type = cstate->cs_addrspace_id;
343 	uint32_t		cs_type = cstate->cs_type;
344 	int			hset_update = 1;
345 	boolean_t		using_timer;
346 	cpu_idle_check_wakeup_t check_func = &acpi_cpu_check_wakeup;
347 
348 	/*
349 	 * Set our mcpu_mwait here, so we can tell if anyone tries to
350 	 * wake us between now and when we call mwait.  No other cpu will
351 	 * attempt to set our mcpu_mwait until we add ourself to the haltset.
352 	 */
353 	if (mcpu_mwait) {
354 		if (type == ACPI_ADR_SPACE_SYSTEM_IO) {
355 			*mcpu_mwait = MWAIT_WAKEUP_IPI;
356 			check_func = &acpi_cpu_mwait_ipi_check_wakeup;
357 		} else {
358 			*mcpu_mwait = MWAIT_HALTED;
359 			check_func = &acpi_cpu_mwait_check_wakeup;
360 		}
361 	}
362 
363 	/*
364 	 * If this CPU is online, and there are multiple CPUs
365 	 * in the system, then we should note our halting
366 	 * by adding ourselves to the partition's halted CPU
367 	 * bitmap. This allows other CPUs to find/awaken us when
368 	 * work becomes available.
369 	 */
370 	if (cpup->cpu_flags & CPU_OFFLINE || ncpus == 1)
371 		hset_update = 0;
372 
373 	/*
374 	 * Add ourselves to the partition's halted CPUs bitmask
375 	 * and set our HALTED flag, if necessary.
376 	 *
377 	 * When a thread becomes runnable, it is placed on the queue
378 	 * and then the halted cpuset is checked to determine who
379 	 * (if anyone) should be awakened. We therefore need to first
380 	 * add ourselves to the halted cpuset, and and then check if there
381 	 * is any work available.
382 	 *
383 	 * Note that memory barriers after updating the HALTED flag
384 	 * are not necessary since an atomic operation (updating the bitmap)
385 	 * immediately follows. On x86 the atomic operation acts as a
386 	 * memory barrier for the update of cpu_disp_flags.
387 	 */
388 	if (hset_update) {
389 		cpup->cpu_disp_flags |= CPU_DISP_HALTED;
390 		bitset_atomic_add(&cp->cp_haltset, cpu_sid);
391 	}
392 
393 	/*
394 	 * Check to make sure there's really nothing to do.
395 	 * Work destined for this CPU may become available after
396 	 * this check. We'll be notified through the clearing of our
397 	 * bit in the halted CPU bitmask, and a write to our mcpu_mwait.
398 	 *
399 	 * disp_anywork() checks disp_nrunnable, so we do not have to later.
400 	 */
401 	if (disp_anywork()) {
402 		if (hset_update) {
403 			cpup->cpu_disp_flags &= ~CPU_DISP_HALTED;
404 			bitset_atomic_del(&cp->cp_haltset, cpu_sid);
405 		}
406 		return;
407 	}
408 
409 	/*
410 	 * We're on our way to being halted.
411 	 *
412 	 * The local APIC timer can stop in ACPI C2 and deeper c-states.
413 	 * Try to program the HPET hardware to substitute for this CPU's
414 	 * LAPIC timer.
415 	 * cstate_use_timer() could disable the LAPIC Timer.  Make sure
416 	 * to start the LAPIC Timer again before leaving this function.
417 	 *
418 	 * Disable interrupts here so we will awaken immediately after halting
419 	 * if someone tries to poke us between now and the time we actually
420 	 * halt.
421 	 */
422 	cli();
423 	using_timer = cstate_use_timer(&lapic_expire, CSTATE_USING_HPET);
424 
425 	/*
426 	 * We check for the presence of our bit after disabling interrupts.
427 	 * If it's cleared, we'll return. If the bit is cleared after
428 	 * we check then the cstate_wakeup() will pop us out of the halted
429 	 * state.
430 	 *
431 	 * This means that the ordering of the cstate_wakeup() and the clearing
432 	 * of the bit by cpu_wakeup is important.
433 	 * cpu_wakeup() must clear our mc_haltset bit, and then call
434 	 * cstate_wakeup().
435 	 * acpi_cpu_cstate() must disable interrupts, then check for the bit.
436 	 */
437 	if (hset_update && bitset_in_set(&cp->cp_haltset, cpu_sid) == 0) {
438 		(void) cstate_use_timer(&lapic_expire,
439 		    CSTATE_USING_LAT);
440 		sti();
441 		cpup->cpu_disp_flags &= ~CPU_DISP_HALTED;
442 		return;
443 	}
444 
445 	/*
446 	 * The check for anything locally runnable is here for performance
447 	 * and isn't needed for correctness. disp_nrunnable ought to be
448 	 * in our cache still, so it's inexpensive to check, and if there
449 	 * is anything runnable we won't have to wait for the poke.
450 	 */
451 	if (cpup->cpu_disp->disp_nrunnable != 0) {
452 		(void) cstate_use_timer(&lapic_expire,
453 		    CSTATE_USING_LAT);
454 		sti();
455 		if (hset_update) {
456 			cpup->cpu_disp_flags &= ~CPU_DISP_HALTED;
457 			bitset_atomic_del(&cp->cp_haltset, cpu_sid);
458 		}
459 		return;
460 	}
461 
462 	if (using_timer == B_FALSE) {
463 
464 		(void) cstate_use_timer(&lapic_expire,
465 		    CSTATE_USING_LAT);
466 		sti();
467 
468 		/*
469 		 * We are currently unable to program the HPET to act as this
470 		 * CPU's proxy LAPIC timer.  This CPU cannot enter C2 or deeper
471 		 * because no timer is set to wake it up while its LAPIC timer
472 		 * stalls in deep C-States.
473 		 * Enter C1 instead.
474 		 *
475 		 * cstate_wake_cpu() will wake this CPU with an IPI which
476 		 * works with MWAIT.
477 		 */
478 		i86_monitor(mcpu_mwait, 0, 0);
479 		if ((*mcpu_mwait & ~MWAIT_WAKEUP_IPI) == MWAIT_HALTED) {
480 			if (cpu_idle_enter(IDLE_STATE_C1, 0,
481 			    check_func, (void *)mcpu_mwait) == 0) {
482 				if ((*mcpu_mwait & ~MWAIT_WAKEUP_IPI) ==
483 				    MWAIT_HALTED) {
484 					i86_mwait(0, 0);
485 				}
486 				cpu_idle_exit(CPU_IDLE_CB_FLAG_IDLE);
487 			}
488 		}
489 
490 		/*
491 		 * We're no longer halted
492 		 */
493 		if (hset_update) {
494 			cpup->cpu_disp_flags &= ~CPU_DISP_HALTED;
495 			bitset_atomic_del(&cp->cp_haltset, cpu_sid);
496 		}
497 		return;
498 	}
499 
500 	if (type == ACPI_ADR_SPACE_FIXED_HARDWARE) {
501 		/*
502 		 * We're on our way to being halted.
503 		 * To avoid a lost wakeup, arm the monitor before checking
504 		 * if another cpu wrote to mcpu_mwait to wake us up.
505 		 */
506 		i86_monitor(mcpu_mwait, 0, 0);
507 		if (*mcpu_mwait == MWAIT_HALTED) {
508 			if (cpu_idle_enter((uint_t)cs_type, 0,
509 			    check_func, (void *)mcpu_mwait) == 0) {
510 				if (*mcpu_mwait == MWAIT_HALTED) {
511 					i86_mwait(cstate->cs_address, 1);
512 				}
513 				cpu_idle_exit(CPU_IDLE_CB_FLAG_IDLE);
514 			}
515 		}
516 	} else if (type == ACPI_ADR_SPACE_SYSTEM_IO) {
517 		uint32_t value;
518 		ACPI_TABLE_FADT *gbl_FADT;
519 
520 		if (*mcpu_mwait == MWAIT_WAKEUP_IPI) {
521 			if (cpu_idle_enter((uint_t)cs_type, 0,
522 			    check_func, (void *)mcpu_mwait) == 0) {
523 				if (*mcpu_mwait == MWAIT_WAKEUP_IPI) {
524 					/*
525 					 * The following calls will cause us to
526 					 * halt which will cause the store
527 					 * buffer to be repartitioned,
528 					 * potentially exposing us to the Intel
529 					 * CPU vulnerability MDS. As such, we
530 					 * need to explicitly call that here.
531 					 * The other idle methods in this
532 					 * function do this automatically as
533 					 * part of the implementation of
534 					 * i86_mwait().
535 					 */
536 					x86_md_clear();
537 					(void) cpu_acpi_read_port(
538 					    cstate->cs_address, &value, 8);
539 					acpica_get_global_FADT(&gbl_FADT);
540 					(void) cpu_acpi_read_port(
541 					    gbl_FADT->XPmTimerBlock.Address,
542 					    &value, 32);
543 				}
544 				cpu_idle_exit(CPU_IDLE_CB_FLAG_IDLE);
545 			}
546 		}
547 	}
548 
549 	/*
550 	 * The LAPIC timer may have stopped in deep c-state.
551 	 * Reprogram this CPU's LAPIC here before enabling interrupts.
552 	 */
553 	(void) cstate_use_timer(&lapic_expire, CSTATE_USING_LAT);
554 	sti();
555 
556 	/*
557 	 * We're no longer halted
558 	 */
559 	if (hset_update) {
560 		cpup->cpu_disp_flags &= ~CPU_DISP_HALTED;
561 		bitset_atomic_del(&cp->cp_haltset, cpu_sid);
562 	}
563 }
564 
565 /*
566  * Idle the present CPU, deep c-state is supported
567  */
568 void
569 cpu_acpi_idle(void)
570 {
571 	cpu_t *cp = CPU;
572 	cpu_acpi_handle_t handle;
573 	cma_c_state_t *cs_data;
574 	cpu_acpi_cstate_t *cstates;
575 	hrtime_t start, end;
576 	int cpu_max_cstates;
577 	uint32_t cs_indx;
578 	uint16_t cs_type;
579 
580 	cpupm_mach_state_t *mach_state =
581 	    (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
582 	handle = mach_state->ms_acpi_handle;
583 	ASSERT(CPU_ACPI_CSTATES(handle) != NULL);
584 
585 	cs_data = mach_state->ms_cstate.cma_state.cstate;
586 	cstates = (cpu_acpi_cstate_t *)CPU_ACPI_CSTATES(handle);
587 	ASSERT(cstates != NULL);
588 	cpu_max_cstates = cpu_acpi_get_max_cstates(handle);
589 	if (cpu_max_cstates > CPU_MAX_CSTATES)
590 		cpu_max_cstates = CPU_MAX_CSTATES;
591 	if (cpu_max_cstates == 1) {	/* no ACPI c-state data */
592 		(*non_deep_idle_cpu)();
593 		return;
594 	}
595 
596 	start = gethrtime_unscaled();
597 
598 	cs_indx = cpupm_next_cstate(cs_data, cstates, cpu_max_cstates, start);
599 
600 	cs_type = cstates[cs_indx].cs_type;
601 
602 	switch (cs_type) {
603 	default:
604 		/* FALLTHROUGH */
605 	case CPU_ACPI_C1:
606 		(*non_deep_idle_cpu)();
607 		break;
608 
609 	case CPU_ACPI_C2:
610 		acpi_cpu_cstate(&cstates[cs_indx]);
611 		break;
612 
613 	case CPU_ACPI_C3:
614 		/*
615 		 * All supported Intel processors maintain cache coherency
616 		 * during C3.  Currently when entering C3 processors flush
617 		 * core caches to higher level shared cache. The shared cache
618 		 * maintains state and supports probes during C3.
619 		 * Consequently there is no need to handle cache coherency
620 		 * and Bus Master activity here with the cache flush, BM_RLD
621 		 * bit, BM_STS bit, nor PM2_CNT.ARB_DIS mechanisms described
622 		 * in section 8.1.4 of the ACPI Specification 4.0.
623 		 */
624 		acpi_cpu_cstate(&cstates[cs_indx]);
625 		break;
626 	}
627 
628 	end = gethrtime_unscaled();
629 
630 	/*
631 	 * Update statistics
632 	 */
633 	cpupm_wakeup_cstate_data(cs_data, end);
634 }
635 
636 boolean_t
637 cpu_deep_cstates_supported(void)
638 {
639 	extern int	idle_cpu_no_deep_c;
640 
641 	if (idle_cpu_no_deep_c)
642 		return (B_FALSE);
643 
644 	if (!cpuid_deep_cstates_supported())
645 		return (B_FALSE);
646 
647 	if (cpuid_arat_supported()) {
648 		cpu_cstate_arat = B_TRUE;
649 		return (B_TRUE);
650 	}
651 
652 	if ((hpet.supported == HPET_FULL_SUPPORT) &&
653 	    hpet.install_proxy()) {
654 		cpu_cstate_hpet = B_TRUE;
655 		return (B_TRUE);
656 	}
657 
658 	return (B_FALSE);
659 }
660 
661 /*
662  * Validate that this processor supports deep cstate and if so,
663  * get the c-state data from ACPI and cache it.
664  */
665 static int
666 cpu_idle_init(cpu_t *cp)
667 {
668 	cpupm_mach_state_t *mach_state =
669 	    (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
670 	cpu_acpi_handle_t handle = mach_state->ms_acpi_handle;
671 	cpu_acpi_cstate_t *cstate;
672 	char name[KSTAT_STRLEN];
673 	int cpu_max_cstates, i;
674 	int ret;
675 
676 	/*
677 	 * Cache the C-state specific ACPI data.
678 	 */
679 	if ((ret = cpu_acpi_cache_cstate_data(handle)) != 0) {
680 		if (ret < 0)
681 			cmn_err(CE_NOTE,
682 			    "!Support for CPU deep idle states is being "
683 			    "disabled due to errors parsing ACPI C-state "
684 			    "objects exported by BIOS.");
685 		cpu_idle_fini(cp);
686 		return (-1);
687 	}
688 
689 	cstate = (cpu_acpi_cstate_t *)CPU_ACPI_CSTATES(handle);
690 
691 	cpu_max_cstates = cpu_acpi_get_max_cstates(handle);
692 
693 	for (i = CPU_ACPI_C1; i <= cpu_max_cstates; i++) {
694 		(void) snprintf(name, KSTAT_STRLEN - 1, "c%d", cstate->cs_type);
695 		/*
696 		 * Allocate, initialize and install cstate kstat
697 		 */
698 		cstate->cs_ksp = kstat_create("cstate", cp->cpu_id,
699 		    name, "misc",
700 		    KSTAT_TYPE_NAMED,
701 		    sizeof (cpu_idle_kstat) / sizeof (kstat_named_t),
702 		    KSTAT_FLAG_VIRTUAL);
703 
704 		if (cstate->cs_ksp == NULL) {
705 			cmn_err(CE_NOTE, "kstat_create(c_state) fail");
706 		} else {
707 			cstate->cs_ksp->ks_data = &cpu_idle_kstat;
708 			cstate->cs_ksp->ks_lock = &cpu_idle_mutex;
709 			cstate->cs_ksp->ks_update = cpu_idle_kstat_update;
710 			cstate->cs_ksp->ks_data_size += MAXNAMELEN;
711 			cstate->cs_ksp->ks_private = cstate;
712 			kstat_install(cstate->cs_ksp);
713 		}
714 		cstate++;
715 	}
716 
717 	cpupm_alloc_domains(cp, CPUPM_C_STATES);
718 	cpupm_alloc_ms_cstate(cp);
719 
720 	if (cpu_deep_cstates_supported()) {
721 		uint32_t value;
722 
723 		mutex_enter(&cpu_idle_callb_mutex);
724 		if (cpu_deep_idle_callb_id == (callb_id_t)0)
725 			cpu_deep_idle_callb_id = callb_add(&cpu_deep_idle_callb,
726 			    (void *)NULL, CB_CL_CPU_DEEP_IDLE, "cpu_deep_idle");
727 		if (cpu_idle_cpr_callb_id == (callb_id_t)0)
728 			cpu_idle_cpr_callb_id = callb_add(&cpu_idle_cpr_callb,
729 			    (void *)NULL, CB_CL_CPR_PM, "cpu_idle_cpr");
730 		mutex_exit(&cpu_idle_callb_mutex);
731 
732 
733 		/*
734 		 * All supported CPUs (Nehalem and later) will remain in C3
735 		 * during Bus Master activity.
736 		 * All CPUs set ACPI_BITREG_BUS_MASTER_RLD to 0 here if it
737 		 * is not already 0 before enabling Deeper C-states.
738 		 */
739 		cpu_acpi_get_register(ACPI_BITREG_BUS_MASTER_RLD, &value);
740 		if (value & 1)
741 			cpu_acpi_set_register(ACPI_BITREG_BUS_MASTER_RLD, 0);
742 	}
743 
744 	return (0);
745 }
746 
747 /*
748  * Free resources allocated by cpu_idle_init().
749  */
750 static void
751 cpu_idle_fini(cpu_t *cp)
752 {
753 	cpupm_mach_state_t *mach_state =
754 	    (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
755 	cpu_acpi_handle_t handle = mach_state->ms_acpi_handle;
756 	cpu_acpi_cstate_t *cstate;
757 	uint_t	cpu_max_cstates, i;
758 
759 	/*
760 	 * idle cpu points back to the generic one
761 	 */
762 	idle_cpu = cp->cpu_m.mcpu_idle_cpu = non_deep_idle_cpu;
763 	disp_enq_thread = non_deep_idle_disp_enq_thread;
764 
765 	cstate = (cpu_acpi_cstate_t *)CPU_ACPI_CSTATES(handle);
766 	if (cstate) {
767 		cpu_max_cstates = cpu_acpi_get_max_cstates(handle);
768 
769 		for (i = CPU_ACPI_C1; i <= cpu_max_cstates; i++) {
770 			if (cstate->cs_ksp != NULL)
771 				kstat_delete(cstate->cs_ksp);
772 			cstate++;
773 		}
774 	}
775 
776 	cpupm_free_ms_cstate(cp);
777 	cpupm_free_domains(&cpupm_cstate_domains);
778 	cpu_acpi_free_cstate_data(handle);
779 
780 	mutex_enter(&cpu_idle_callb_mutex);
781 	if (cpu_deep_idle_callb_id != (callb_id_t)0) {
782 		(void) callb_delete(cpu_deep_idle_callb_id);
783 		cpu_deep_idle_callb_id = (callb_id_t)0;
784 	}
785 	if (cpu_idle_cpr_callb_id != (callb_id_t)0) {
786 		(void) callb_delete(cpu_idle_cpr_callb_id);
787 		cpu_idle_cpr_callb_id = (callb_id_t)0;
788 	}
789 	mutex_exit(&cpu_idle_callb_mutex);
790 }
791 
792 /*
793  * This function is introduced here to solve a race condition
794  * between the master and the slave to touch c-state data structure.
795  * After the slave calls this idle function to switch to the non
796  * deep idle function, the master can go on to reclaim the resource.
797  */
798 static void
799 cpu_idle_stop_sync(void)
800 {
801 	/* switch to the non deep idle function */
802 	CPU->cpu_m.mcpu_idle_cpu = non_deep_idle_cpu;
803 }
804 
805 static void
806 cpu_idle_stop(cpu_t *cp)
807 {
808 	cpupm_mach_state_t *mach_state =
809 	    (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
810 	cpu_acpi_handle_t handle = mach_state->ms_acpi_handle;
811 	cpu_acpi_cstate_t *cstate;
812 	uint_t cpu_max_cstates, i = 0;
813 
814 	mutex_enter(&cpu_idle_callb_mutex);
815 	if (idle_cpu == cpu_idle_adaptive) {
816 		/*
817 		 * invoke the slave to call synchronous idle function.
818 		 */
819 		cp->cpu_m.mcpu_idle_cpu = cpu_idle_stop_sync;
820 		poke_cpu(cp->cpu_id);
821 
822 		/*
823 		 * wait until the slave switchs to non deep idle function,
824 		 * so that the master is safe to go on to reclaim the resource.
825 		 */
826 		while (cp->cpu_m.mcpu_idle_cpu != non_deep_idle_cpu) {
827 			drv_usecwait(10);
828 			if ((++i % CPU_IDLE_STOP_TIMEOUT) == 0)
829 				cmn_err(CE_NOTE, "!cpu_idle_stop: the slave"
830 				    " idle stop timeout");
831 		}
832 	}
833 	mutex_exit(&cpu_idle_callb_mutex);
834 
835 	cstate = (cpu_acpi_cstate_t *)CPU_ACPI_CSTATES(handle);
836 	if (cstate) {
837 		cpu_max_cstates = cpu_acpi_get_max_cstates(handle);
838 
839 		for (i = CPU_ACPI_C1; i <= cpu_max_cstates; i++) {
840 			if (cstate->cs_ksp != NULL)
841 				kstat_delete(cstate->cs_ksp);
842 			cstate++;
843 		}
844 	}
845 	cpupm_free_ms_cstate(cp);
846 	cpupm_remove_domains(cp, CPUPM_C_STATES, &cpupm_cstate_domains);
847 	cpu_acpi_free_cstate_data(handle);
848 }
849 
850 /*ARGSUSED*/
851 static boolean_t
852 cpu_deep_idle_callb(void *arg, int code)
853 {
854 	boolean_t rslt = B_TRUE;
855 
856 	mutex_enter(&cpu_idle_callb_mutex);
857 	switch (code) {
858 	case PM_DEFAULT_CPU_DEEP_IDLE:
859 		/*
860 		 * Default policy is same as enable
861 		 */
862 		/*FALLTHROUGH*/
863 	case PM_ENABLE_CPU_DEEP_IDLE:
864 		if ((cpu_idle_cfg_state & CPU_IDLE_DEEP_CFG) == 0)
865 			break;
866 
867 		if (cstate_timer_callback(PM_ENABLE_CPU_DEEP_IDLE)) {
868 			disp_enq_thread = cstate_wakeup;
869 			idle_cpu = cpu_idle_adaptive;
870 			cpu_idle_cfg_state &= ~CPU_IDLE_DEEP_CFG;
871 		} else {
872 			rslt = B_FALSE;
873 		}
874 		break;
875 
876 	case PM_DISABLE_CPU_DEEP_IDLE:
877 		if (cpu_idle_cfg_state & CPU_IDLE_DEEP_CFG)
878 			break;
879 
880 		idle_cpu = non_deep_idle_cpu;
881 		if (cstate_timer_callback(PM_DISABLE_CPU_DEEP_IDLE)) {
882 			disp_enq_thread = non_deep_idle_disp_enq_thread;
883 			cpu_idle_cfg_state |= CPU_IDLE_DEEP_CFG;
884 		}
885 		break;
886 
887 	default:
888 		cmn_err(CE_NOTE, "!cpu deep_idle_callb: invalid code %d\n",
889 		    code);
890 		break;
891 	}
892 	mutex_exit(&cpu_idle_callb_mutex);
893 	return (rslt);
894 }
895 
896 /*ARGSUSED*/
897 static boolean_t
898 cpu_idle_cpr_callb(void *arg, int code)
899 {
900 	boolean_t rslt = B_TRUE;
901 
902 	mutex_enter(&cpu_idle_callb_mutex);
903 	switch (code) {
904 	case CB_CODE_CPR_RESUME:
905 		if (cstate_timer_callback(CB_CODE_CPR_RESUME)) {
906 			/*
907 			 * Do not enable dispatcher hooks if disabled by user.
908 			 */
909 			if (cpu_idle_cfg_state & CPU_IDLE_DEEP_CFG)
910 				break;
911 
912 			disp_enq_thread = cstate_wakeup;
913 			idle_cpu = cpu_idle_adaptive;
914 		} else {
915 			rslt = B_FALSE;
916 		}
917 		break;
918 
919 	case CB_CODE_CPR_CHKPT:
920 		idle_cpu = non_deep_idle_cpu;
921 		disp_enq_thread = non_deep_idle_disp_enq_thread;
922 		(void) cstate_timer_callback(CB_CODE_CPR_CHKPT);
923 		break;
924 
925 	default:
926 		cmn_err(CE_NOTE, "!cpudvr cpr_callb: invalid code %d\n", code);
927 		break;
928 	}
929 	mutex_exit(&cpu_idle_callb_mutex);
930 	return (rslt);
931 }
932 
933 /*
934  * handle _CST notification
935  */
936 void
937 cpuidle_cstate_instance(cpu_t *cp)
938 {
939 #ifndef	__xpv
940 	cpupm_mach_state_t	*mach_state =
941 	    (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
942 	cpu_acpi_handle_t	handle;
943 	struct machcpu		*mcpu;
944 	cpuset_t 		dom_cpu_set;
945 	kmutex_t		*pm_lock;
946 	int			result = 0;
947 	processorid_t		cpu_id;
948 
949 	if (mach_state == NULL) {
950 		return;
951 	}
952 
953 	ASSERT(mach_state->ms_cstate.cma_domain != NULL);
954 	dom_cpu_set = mach_state->ms_cstate.cma_domain->pm_cpus;
955 	pm_lock = &mach_state->ms_cstate.cma_domain->pm_lock;
956 
957 	/*
958 	 * Do for all the CPU's in the domain
959 	 */
960 	mutex_enter(pm_lock);
961 	do {
962 		CPUSET_FIND(dom_cpu_set, cpu_id);
963 		if (cpu_id == CPUSET_NOTINSET)
964 			break;
965 
966 		ASSERT(cpu_id >= 0 && cpu_id < NCPU);
967 		cp = cpu[cpu_id];
968 		mach_state = (cpupm_mach_state_t *)
969 		    cp->cpu_m.mcpu_pm_mach_state;
970 		if (!(mach_state->ms_caps & CPUPM_C_STATES)) {
971 			mutex_exit(pm_lock);
972 			return;
973 		}
974 		handle = mach_state->ms_acpi_handle;
975 		ASSERT(handle != NULL);
976 
977 		/*
978 		 * re-evaluate cstate object
979 		 */
980 		if (cpu_acpi_cache_cstate_data(handle) != 0) {
981 			cmn_err(CE_WARN, "Cannot re-evaluate the cpu c-state"
982 			    " object Instance: %d", cpu_id);
983 		}
984 		mcpu = &(cp->cpu_m);
985 		mcpu->max_cstates = cpu_acpi_get_max_cstates(handle);
986 		if (mcpu->max_cstates > CPU_ACPI_C1) {
987 			(void) cstate_timer_callback(
988 			    CST_EVENT_MULTIPLE_CSTATES);
989 			disp_enq_thread = cstate_wakeup;
990 			cp->cpu_m.mcpu_idle_cpu = cpu_acpi_idle;
991 		} else if (mcpu->max_cstates == CPU_ACPI_C1) {
992 			disp_enq_thread = non_deep_idle_disp_enq_thread;
993 			cp->cpu_m.mcpu_idle_cpu = non_deep_idle_cpu;
994 			(void) cstate_timer_callback(CST_EVENT_ONE_CSTATE);
995 		}
996 
997 		CPUSET_ATOMIC_XDEL(dom_cpu_set, cpu_id, result);
998 	} while (result < 0);
999 	mutex_exit(pm_lock);
1000 #endif
1001 }
1002 
1003 /*
1004  * handle the number or the type of available processor power states change
1005  */
1006 void
1007 cpuidle_manage_cstates(void *ctx)
1008 {
1009 	cpu_t			*cp = ctx;
1010 	cpupm_mach_state_t	*mach_state =
1011 	    (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
1012 	boolean_t		is_ready;
1013 
1014 	if (mach_state == NULL) {
1015 		return;
1016 	}
1017 
1018 	/*
1019 	 * We currently refuse to power manage if the CPU is not ready to
1020 	 * take cross calls (cross calls fail silently if CPU is not ready
1021 	 * for it).
1022 	 *
1023 	 * Additionally, for x86 platforms we cannot power manage an instance,
1024 	 * until it has been initialized.
1025 	 */
1026 	is_ready = (cp->cpu_flags & CPU_READY) && cpupm_cstate_ready(cp);
1027 	if (!is_ready)
1028 		return;
1029 
1030 	cpuidle_cstate_instance(cp);
1031 }
1032