/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */
/*
 * Copyright (c) 2009-2010, Intel Corporation.
 * All rights reserved.
 */

#include <sys/x86_archext.h>
#include <sys/machsystm.h>
#include <sys/x_call.h>
#include <sys/stat.h>
#include <sys/acpi/acpi.h>
#include <sys/acpica.h>
#include <sys/cpu_acpi.h>
#include <sys/cpu_idle.h>
#include <sys/cpupm.h>
#include <sys/cpu_event.h>
#include <sys/hpet.h>
#include <sys/archsystm.h>
#include <vm/hat_i86.h>
#include <sys/dtrace.h>
#include <sys/sdt.h>
#include <sys/callb.h>

#define	CSTATE_USING_HPET		1
#define	CSTATE_USING_LAT		2

#define	CPU_IDLE_STOP_TIMEOUT		1000

extern void cpu_idle_adaptive(void);
extern uint32_t cpupm_next_cstate(cma_c_state_t *cs_data,
    cpu_acpi_cstate_t *cstates, uint32_t cs_count, hrtime_t start);

static int cpu_idle_init(cpu_t *);
static void cpu_idle_fini(cpu_t *);
static void cpu_idle_stop(cpu_t *);
static boolean_t cpu_deep_idle_callb(void *arg, int code);
static boolean_t cpu_idle_cpr_callb(void *arg, int code);
static void acpi_cpu_cstate(cpu_acpi_cstate_t *cstate);

static boolean_t cstate_use_timer(hrtime_t *lapic_expire, int timer);

/*
 * the flag of always-running local APIC timer.
 * the flag of HPET Timer use in deep cstate.
 */
static boolean_t cpu_cstate_arat = B_FALSE;
static boolean_t cpu_cstate_hpet = B_FALSE;

/*
 * Interfaces for modules implementing Intel's deep c-state.
 */
cpupm_state_ops_t cpu_idle_ops = {
	"Generic ACPI C-state Support",
	cpu_idle_init,
	cpu_idle_fini,
	NULL,
	cpu_idle_stop
};

static kmutex_t		cpu_idle_callb_mutex;
static callb_id_t	cpu_deep_idle_callb_id;
static callb_id_t	cpu_idle_cpr_callb_id;
static uint_t		cpu_idle_cfg_state;

static kmutex_t cpu_idle_mutex;

cpu_idle_kstat_t cpu_idle_kstat = {
	{ "address_space_id",	KSTAT_DATA_STRING },
	{ "latency",		KSTAT_DATA_UINT32 },
	{ "power",		KSTAT_DATA_UINT32 },
};

/*
 * kstat update function of the c-state info
 */
static int
cpu_idle_kstat_update(kstat_t *ksp, int flag)
{
	cpu_acpi_cstate_t *cstate = ksp->ks_private;

	if (flag == KSTAT_WRITE) {
		return (EACCES);
	}

	if (cstate->cs_addrspace_id == ACPI_ADR_SPACE_FIXED_HARDWARE) {
		kstat_named_setstr(&cpu_idle_kstat.addr_space_id,
		"FFixedHW");
	} else if (cstate->cs_addrspace_id == ACPI_ADR_SPACE_SYSTEM_IO) {
		kstat_named_setstr(&cpu_idle_kstat.addr_space_id,
		"SystemIO");
	} else {
		kstat_named_setstr(&cpu_idle_kstat.addr_space_id,
		"Unsupported");
	}

	cpu_idle_kstat.cs_latency.value.ui32 = cstate->cs_latency;
	cpu_idle_kstat.cs_power.value.ui32 = cstate->cs_power;

	return (0);
}

/*
 * Used during configuration callbacks to manage implementation specific
 * details of the hardware timer used during Deep C-state.
 */
boolean_t
cstate_timer_callback(int code)
{
	if (cpu_cstate_arat) {
		return (B_TRUE);
	} else if (cpu_cstate_hpet) {
		return (hpet.callback(code));
	}
	return (B_FALSE);
}

/*
 * Some Local APIC Timers do not work during Deep C-states.
 * The Deep C-state idle function uses this function to ensure it is using a
 * hardware timer that works during Deep C-states.  This function also
 * switches the timer back to the LACPI Timer after Deep C-state.
 */
static boolean_t
cstate_use_timer(hrtime_t *lapic_expire, int timer)
{
	if (cpu_cstate_arat)
		return (B_TRUE);

	/*
	 * We have to return B_FALSE if no arat or hpet support
	 */
	if (!cpu_cstate_hpet)
		return (B_FALSE);

	switch (timer) {
	case CSTATE_USING_HPET:
		return (hpet.use_hpet_timer(lapic_expire));
	case CSTATE_USING_LAT:
		hpet.use_lapic_timer(*lapic_expire);
		return (B_TRUE);
	default:
		return (B_FALSE);
	}
}

/*
 * c-state wakeup function.
 * Similar to cpu_wakeup and cpu_wakeup_mwait except this function deals
 * with CPUs asleep in MWAIT, HLT, or ACPI Deep C-State.
 */
void
cstate_wakeup(cpu_t *cp, int bound)
{
	struct machcpu	*mcpu = &(cp->cpu_m);
	volatile uint32_t *mcpu_mwait = mcpu->mcpu_mwait;
	cpupart_t	*cpu_part;
	uint_t		cpu_found;
	processorid_t	cpu_sid;

	cpu_part = cp->cpu_part;
	cpu_sid = cp->cpu_seqid;
	/*
	 * Clear the halted bit for that CPU since it will be woken up
	 * in a moment.
	 */
	if (bitset_in_set(&cpu_part->cp_haltset, cpu_sid)) {
		/*
		 * Clear the halted bit for that CPU since it will be
		 * poked in a moment.
		 */
		bitset_atomic_del(&cpu_part->cp_haltset, cpu_sid);

		/*
		 * We may find the current CPU present in the halted cpuset
		 * if we're in the context of an interrupt that occurred
		 * before we had a chance to clear our bit in cpu_idle().
		 * Waking ourself is obviously unnecessary, since if
		 * we're here, we're not halted.
		 */
		if (cp != CPU) {
			/*
			 * Use correct wakeup mechanism
			 */
			if ((mcpu_mwait != NULL) &&
			    (*mcpu_mwait == MWAIT_HALTED))
				MWAIT_WAKEUP(cp);
			else
				poke_cpu(cp->cpu_id);
		}
		return;
	} else {
		/*
		 * This cpu isn't halted, but it's idle or undergoing a
		 * context switch. No need to awaken anyone else.
		 */
		if (cp->cpu_thread == cp->cpu_idle_thread ||
		    cp->cpu_disp_flags & CPU_DISP_DONTSTEAL)
			return;
	}

	/*
	 * No need to wake up other CPUs if the thread we just enqueued
	 * is bound.
	 */
	if (bound)
		return;


	/*
	 * See if there's any other halted CPUs. If there are, then
	 * select one, and awaken it.
	 * It's possible that after we find a CPU, somebody else
	 * will awaken it before we get the chance.
	 * In that case, look again.
	 */
	do {
		cpu_found = bitset_find(&cpu_part->cp_haltset);
		if (cpu_found == (uint_t)-1)
			return;

	} while (bitset_atomic_test_and_del(&cpu_part->cp_haltset,
	    cpu_found) < 0);

	/*
	 * Must use correct wakeup mechanism to avoid lost wakeup of
	 * alternate cpu.
	 */
	if (cpu_found != CPU->cpu_seqid) {
		mcpu_mwait = cpu_seq[cpu_found]->cpu_m.mcpu_mwait;
		if ((mcpu_mwait != NULL) && (*mcpu_mwait == MWAIT_HALTED))
			MWAIT_WAKEUP(cpu_seq[cpu_found]);
		else
			poke_cpu(cpu_seq[cpu_found]->cpu_id);
	}
}

/*
 * Function called by CPU idle notification framework to check whether CPU
 * has been awakened. It will be called with interrupt disabled.
 * If CPU has been awakened, call cpu_idle_exit() to notify CPU idle
 * notification framework.
 */
static void
acpi_cpu_mwait_check_wakeup(void *arg)
{
	volatile uint32_t *mcpu_mwait = (volatile uint32_t *)arg;

	ASSERT(arg != NULL);
	if (*mcpu_mwait != MWAIT_HALTED) {
		/*
		 * CPU has been awakened, notify CPU idle notification system.
		 */
		cpu_idle_exit(CPU_IDLE_CB_FLAG_IDLE);
	} else {
		/*
		 * Toggle interrupt flag to detect pending interrupts.
		 * If interrupt happened, do_interrupt() will notify CPU idle
		 * notification framework so no need to call cpu_idle_exit()
		 * here.
		 */
		sti();
		SMT_PAUSE();
		cli();
	}
}

static void
acpi_cpu_mwait_ipi_check_wakeup(void *arg)
{
	volatile uint32_t *mcpu_mwait = (volatile uint32_t *)arg;

	ASSERT(arg != NULL);
	if (*mcpu_mwait != MWAIT_WAKEUP_IPI) {
		/*
		 * CPU has been awakened, notify CPU idle notification system.
		 */
		cpu_idle_exit(CPU_IDLE_CB_FLAG_IDLE);
	} else {
		/*
		 * Toggle interrupt flag to detect pending interrupts.
		 * If interrupt happened, do_interrupt() will notify CPU idle
		 * notification framework so no need to call cpu_idle_exit()
		 * here.
		 */
		sti();
		SMT_PAUSE();
		cli();
	}
}

/*ARGSUSED*/
static void
acpi_cpu_check_wakeup(void *arg)
{
	/*
	 * Toggle interrupt flag to detect pending interrupts.
	 * If interrupt happened, do_interrupt() will notify CPU idle
	 * notification framework so no need to call cpu_idle_exit() here.
	 */
	sti();
	SMT_PAUSE();
	cli();
}

/*
 * enter deep c-state handler
 */
static void
acpi_cpu_cstate(cpu_acpi_cstate_t *cstate)
{
	volatile uint32_t	*mcpu_mwait = CPU->cpu_m.mcpu_mwait;
	cpu_t			*cpup = CPU;
	processorid_t		cpu_sid = cpup->cpu_seqid;
	cpupart_t		*cp = cpup->cpu_part;
	hrtime_t		lapic_expire;
	uint8_t			type = cstate->cs_addrspace_id;
	uint32_t		cs_type = cstate->cs_type;
	int			hset_update = 1;
	boolean_t		using_timer;
	cpu_idle_check_wakeup_t check_func = &acpi_cpu_check_wakeup;

	/*
	 * Set our mcpu_mwait here, so we can tell if anyone tries to
	 * wake us between now and when we call mwait.  No other cpu will
	 * attempt to set our mcpu_mwait until we add ourself to the haltset.
	 */
	if (mcpu_mwait) {
		if (type == ACPI_ADR_SPACE_SYSTEM_IO) {
			*mcpu_mwait = MWAIT_WAKEUP_IPI;
			check_func = &acpi_cpu_mwait_ipi_check_wakeup;
		} else {
			*mcpu_mwait = MWAIT_HALTED;
			check_func = &acpi_cpu_mwait_check_wakeup;
		}
	}

	/*
	 * If this CPU is online, and there are multiple CPUs
	 * in the system, then we should note our halting
	 * by adding ourselves to the partition's halted CPU
	 * bitmap. This allows other CPUs to find/awaken us when
	 * work becomes available.
	 */
	if (cpup->cpu_flags & CPU_OFFLINE || ncpus == 1)
		hset_update = 0;

	/*
	 * Add ourselves to the partition's halted CPUs bitmask
	 * and set our HALTED flag, if necessary.
	 *
	 * When a thread becomes runnable, it is placed on the queue
	 * and then the halted cpuset is checked to determine who
	 * (if anyone) should be awakened. We therefore need to first
	 * add ourselves to the halted cpuset, and and then check if there
	 * is any work available.
	 *
	 * Note that memory barriers after updating the HALTED flag
	 * are not necessary since an atomic operation (updating the bitmap)
	 * immediately follows. On x86 the atomic operation acts as a
	 * memory barrier for the update of cpu_disp_flags.
	 */
	if (hset_update) {
		cpup->cpu_disp_flags |= CPU_DISP_HALTED;
		bitset_atomic_add(&cp->cp_haltset, cpu_sid);
	}

	/*
	 * Check to make sure there's really nothing to do.
	 * Work destined for this CPU may become available after
	 * this check. We'll be notified through the clearing of our
	 * bit in the halted CPU bitmask, and a write to our mcpu_mwait.
	 *
	 * disp_anywork() checks disp_nrunnable, so we do not have to later.
	 */
	if (disp_anywork()) {
		if (hset_update) {
			cpup->cpu_disp_flags &= ~CPU_DISP_HALTED;
			bitset_atomic_del(&cp->cp_haltset, cpu_sid);
		}
		return;
	}

	/*
	 * We're on our way to being halted.
	 *
	 * The local APIC timer can stop in ACPI C2 and deeper c-states.
	 * Try to program the HPET hardware to substitute for this CPU's
	 * LAPIC timer.
	 * cstate_use_timer() could disable the LAPIC Timer.  Make sure
	 * to start the LAPIC Timer again before leaving this function.
	 *
	 * Disable interrupts here so we will awaken immediately after halting
	 * if someone tries to poke us between now and the time we actually
	 * halt.
	 */
	cli();
	using_timer = cstate_use_timer(&lapic_expire, CSTATE_USING_HPET);

	/*
	 * We check for the presence of our bit after disabling interrupts.
	 * If it's cleared, we'll return. If the bit is cleared after
	 * we check then the cstate_wakeup() will pop us out of the halted
	 * state.
	 *
	 * This means that the ordering of the cstate_wakeup() and the clearing
	 * of the bit by cpu_wakeup is important.
	 * cpu_wakeup() must clear our mc_haltset bit, and then call
	 * cstate_wakeup().
	 * acpi_cpu_cstate() must disable interrupts, then check for the bit.
	 */
	if (hset_update && bitset_in_set(&cp->cp_haltset, cpu_sid) == 0) {
		(void) cstate_use_timer(&lapic_expire,
		    CSTATE_USING_LAT);
		sti();
		cpup->cpu_disp_flags &= ~CPU_DISP_HALTED;
		return;
	}

	/*
	 * The check for anything locally runnable is here for performance
	 * and isn't needed for correctness. disp_nrunnable ought to be
	 * in our cache still, so it's inexpensive to check, and if there
	 * is anything runnable we won't have to wait for the poke.
	 */
	if (cpup->cpu_disp->disp_nrunnable != 0) {
		(void) cstate_use_timer(&lapic_expire,
		    CSTATE_USING_LAT);
		sti();
		if (hset_update) {
			cpup->cpu_disp_flags &= ~CPU_DISP_HALTED;
			bitset_atomic_del(&cp->cp_haltset, cpu_sid);
		}
		return;
	}

	if (using_timer == B_FALSE) {

		(void) cstate_use_timer(&lapic_expire,
		    CSTATE_USING_LAT);
		sti();

		/*
		 * We are currently unable to program the HPET to act as this
		 * CPU's proxy LAPIC timer.  This CPU cannot enter C2 or deeper
		 * because no timer is set to wake it up while its LAPIC timer
		 * stalls in deep C-States.
		 * Enter C1 instead.
		 *
		 * cstate_wake_cpu() will wake this CPU with an IPI which
		 * works with MWAIT.
		 */
		i86_monitor(mcpu_mwait, 0, 0);
		if ((*mcpu_mwait & ~MWAIT_WAKEUP_IPI) == MWAIT_HALTED) {
			if (cpu_idle_enter(IDLE_STATE_C1, 0,
			    check_func, (void *)mcpu_mwait) == 0) {
				if ((*mcpu_mwait & ~MWAIT_WAKEUP_IPI) ==
				    MWAIT_HALTED) {
					i86_mwait(0, 0);
				}
				cpu_idle_exit(CPU_IDLE_CB_FLAG_IDLE);
			}
		}

		/*
		 * We're no longer halted
		 */
		if (hset_update) {
			cpup->cpu_disp_flags &= ~CPU_DISP_HALTED;
			bitset_atomic_del(&cp->cp_haltset, cpu_sid);
		}
		return;
	}

	if (type == ACPI_ADR_SPACE_FIXED_HARDWARE) {
		/*
		 * We're on our way to being halted.
		 * To avoid a lost wakeup, arm the monitor before checking
		 * if another cpu wrote to mcpu_mwait to wake us up.
		 */
		i86_monitor(mcpu_mwait, 0, 0);
		if (*mcpu_mwait == MWAIT_HALTED) {
			if (cpu_idle_enter((uint_t)cs_type, 0,
			    check_func, (void *)mcpu_mwait) == 0) {
				if (*mcpu_mwait == MWAIT_HALTED) {
					i86_mwait(cstate->cs_address, 1);
				}
				cpu_idle_exit(CPU_IDLE_CB_FLAG_IDLE);
			}
		}
	} else if (type == ACPI_ADR_SPACE_SYSTEM_IO) {
		uint32_t value;
		ACPI_TABLE_FADT *gbl_FADT;

		if (*mcpu_mwait == MWAIT_WAKEUP_IPI) {
			if (cpu_idle_enter((uint_t)cs_type, 0,
			    check_func, (void *)mcpu_mwait) == 0) {
				if (*mcpu_mwait == MWAIT_WAKEUP_IPI) {
					(void) cpu_acpi_read_port(
					    cstate->cs_address, &value, 8);
					acpica_get_global_FADT(&gbl_FADT);
					(void) cpu_acpi_read_port(
					    gbl_FADT->XPmTimerBlock.Address,
					    &value, 32);
				}
				cpu_idle_exit(CPU_IDLE_CB_FLAG_IDLE);
			}
		}
	}

	/*
	 * The LAPIC timer may have stopped in deep c-state.
	 * Reprogram this CPU's LAPIC here before enabling interrupts.
	 */
	(void) cstate_use_timer(&lapic_expire, CSTATE_USING_LAT);
	sti();

	/*
	 * We're no longer halted
	 */
	if (hset_update) {
		cpup->cpu_disp_flags &= ~CPU_DISP_HALTED;
		bitset_atomic_del(&cp->cp_haltset, cpu_sid);
	}
}

/*
 * Idle the present CPU, deep c-state is supported
 */
void
cpu_acpi_idle(void)
{
	cpu_t *cp = CPU;
	cpu_acpi_handle_t handle;
	cma_c_state_t *cs_data;
	cpu_acpi_cstate_t *cstates;
	hrtime_t start, end;
	int cpu_max_cstates;
	uint32_t cs_indx;
	uint16_t cs_type;

	cpupm_mach_state_t *mach_state =
	    (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
	handle = mach_state->ms_acpi_handle;
	ASSERT(CPU_ACPI_CSTATES(handle) != NULL);

	cs_data = mach_state->ms_cstate.cma_state.cstate;
	cstates = (cpu_acpi_cstate_t *)CPU_ACPI_CSTATES(handle);
	ASSERT(cstates != NULL);
	cpu_max_cstates = cpu_acpi_get_max_cstates(handle);
	if (cpu_max_cstates > CPU_MAX_CSTATES)
		cpu_max_cstates = CPU_MAX_CSTATES;
	if (cpu_max_cstates == 1) {	/* no ACPI c-state data */
		(*non_deep_idle_cpu)();
		return;
	}

	start = gethrtime_unscaled();

	cs_indx = cpupm_next_cstate(cs_data, cstates, cpu_max_cstates, start);

	cs_type = cstates[cs_indx].cs_type;

	switch (cs_type) {
	default:
		/* FALLTHROUGH */
	case CPU_ACPI_C1:
		(*non_deep_idle_cpu)();
		break;

	case CPU_ACPI_C2:
		acpi_cpu_cstate(&cstates[cs_indx]);
		break;

	case CPU_ACPI_C3:
		/*
		 * All supported Intel processors maintain cache coherency
		 * during C3.  Currently when entering C3 processors flush
		 * core caches to higher level shared cache. The shared cache
		 * maintains state and supports probes during C3.
		 * Consequently there is no need to handle cache coherency
		 * and Bus Master activity here with the cache flush, BM_RLD
		 * bit, BM_STS bit, nor PM2_CNT.ARB_DIS mechanisms described
		 * in section 8.1.4 of the ACPI Specification 4.0.
		 */
		acpi_cpu_cstate(&cstates[cs_indx]);
		break;
	}

	end = gethrtime_unscaled();

	/*
	 * Update statistics
	 */
	cpupm_wakeup_cstate_data(cs_data, end);
}

boolean_t
cpu_deep_cstates_supported(void)
{
	extern int	idle_cpu_no_deep_c;

	if (idle_cpu_no_deep_c)
		return (B_FALSE);

	if (!cpuid_deep_cstates_supported())
		return (B_FALSE);

	if (cpuid_arat_supported()) {
		cpu_cstate_arat = B_TRUE;
		return (B_TRUE);
	}

	if ((hpet.supported == HPET_FULL_SUPPORT) &&
	    hpet.install_proxy()) {
		cpu_cstate_hpet = B_TRUE;
		return (B_TRUE);
	}

	return (B_FALSE);
}

/*
 * Validate that this processor supports deep cstate and if so,
 * get the c-state data from ACPI and cache it.
 */
static int
cpu_idle_init(cpu_t *cp)
{
	cpupm_mach_state_t *mach_state =
	    (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
	cpu_acpi_handle_t handle = mach_state->ms_acpi_handle;
	cpu_acpi_cstate_t *cstate;
	char name[KSTAT_STRLEN];
	int cpu_max_cstates, i;
	int ret;

	/*
	 * Cache the C-state specific ACPI data.
	 */
	if ((ret = cpu_acpi_cache_cstate_data(handle)) != 0) {
		if (ret < 0)
			cmn_err(CE_NOTE,
			    "!Support for CPU deep idle states is being "
			    "disabled due to errors parsing ACPI C-state "
			    "objects exported by BIOS.");
		cpu_idle_fini(cp);
		return (-1);
	}

	cstate = (cpu_acpi_cstate_t *)CPU_ACPI_CSTATES(handle);

	cpu_max_cstates = cpu_acpi_get_max_cstates(handle);

	for (i = CPU_ACPI_C1; i <= cpu_max_cstates; i++) {
		(void) snprintf(name, KSTAT_STRLEN - 1, "c%d", cstate->cs_type);
		/*
		 * Allocate, initialize and install cstate kstat
		 */
		cstate->cs_ksp = kstat_create("cstate", cp->cpu_id,
		    name, "misc",
		    KSTAT_TYPE_NAMED,
		    sizeof (cpu_idle_kstat) / sizeof (kstat_named_t),
		    KSTAT_FLAG_VIRTUAL);

		if (cstate->cs_ksp == NULL) {
			cmn_err(CE_NOTE, "kstat_create(c_state) fail");
		} else {
			cstate->cs_ksp->ks_data = &cpu_idle_kstat;
			cstate->cs_ksp->ks_lock = &cpu_idle_mutex;
			cstate->cs_ksp->ks_update = cpu_idle_kstat_update;
			cstate->cs_ksp->ks_data_size += MAXNAMELEN;
			cstate->cs_ksp->ks_private = cstate;
			kstat_install(cstate->cs_ksp);
		}
		cstate++;
	}

	cpupm_alloc_domains(cp, CPUPM_C_STATES);
	cpupm_alloc_ms_cstate(cp);

	if (cpu_deep_cstates_supported()) {
		uint32_t value;

		mutex_enter(&cpu_idle_callb_mutex);
		if (cpu_deep_idle_callb_id == (callb_id_t)0)
			cpu_deep_idle_callb_id = callb_add(&cpu_deep_idle_callb,
			    (void *)NULL, CB_CL_CPU_DEEP_IDLE, "cpu_deep_idle");
		if (cpu_idle_cpr_callb_id == (callb_id_t)0)
			cpu_idle_cpr_callb_id = callb_add(&cpu_idle_cpr_callb,
			    (void *)NULL, CB_CL_CPR_PM, "cpu_idle_cpr");
		mutex_exit(&cpu_idle_callb_mutex);


		/*
		 * All supported CPUs (Nehalem and later) will remain in C3
		 * during Bus Master activity.
		 * All CPUs set ACPI_BITREG_BUS_MASTER_RLD to 0 here if it
		 * is not already 0 before enabling Deeper C-states.
		 */
		cpu_acpi_get_register(ACPI_BITREG_BUS_MASTER_RLD, &value);
		if (value & 1)
			cpu_acpi_set_register(ACPI_BITREG_BUS_MASTER_RLD, 0);
	}

	return (0);
}

/*
 * Free resources allocated by cpu_idle_init().
 */
static void
cpu_idle_fini(cpu_t *cp)
{
	cpupm_mach_state_t *mach_state =
	    (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
	cpu_acpi_handle_t handle = mach_state->ms_acpi_handle;
	cpu_acpi_cstate_t *cstate;
	uint_t	cpu_max_cstates, i;

	/*
	 * idle cpu points back to the generic one
	 */
	idle_cpu = cp->cpu_m.mcpu_idle_cpu = non_deep_idle_cpu;
	disp_enq_thread = non_deep_idle_disp_enq_thread;

	cstate = (cpu_acpi_cstate_t *)CPU_ACPI_CSTATES(handle);
	if (cstate) {
		cpu_max_cstates = cpu_acpi_get_max_cstates(handle);

		for (i = CPU_ACPI_C1; i <= cpu_max_cstates; i++) {
			if (cstate->cs_ksp != NULL)
				kstat_delete(cstate->cs_ksp);
			cstate++;
		}
	}

	cpupm_free_ms_cstate(cp);
	cpupm_free_domains(&cpupm_cstate_domains);
	cpu_acpi_free_cstate_data(handle);

	mutex_enter(&cpu_idle_callb_mutex);
	if (cpu_deep_idle_callb_id != (callb_id_t)0) {
		(void) callb_delete(cpu_deep_idle_callb_id);
		cpu_deep_idle_callb_id = (callb_id_t)0;
	}
	if (cpu_idle_cpr_callb_id != (callb_id_t)0) {
		(void) callb_delete(cpu_idle_cpr_callb_id);
		cpu_idle_cpr_callb_id = (callb_id_t)0;
	}
	mutex_exit(&cpu_idle_callb_mutex);
}

/*
 * This function is introduced here to solve a race condition
 * between the master and the slave to touch c-state data structure.
 * After the slave calls this idle function to switch to the non
 * deep idle function, the master can go on to reclaim the resource.
 */
static void
cpu_idle_stop_sync(void)
{
	/* switch to the non deep idle function */
	CPU->cpu_m.mcpu_idle_cpu = non_deep_idle_cpu;
}

static void
cpu_idle_stop(cpu_t *cp)
{
	cpupm_mach_state_t *mach_state =
	    (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
	cpu_acpi_handle_t handle = mach_state->ms_acpi_handle;
	cpu_acpi_cstate_t *cstate;
	uint_t cpu_max_cstates, i = 0;

	mutex_enter(&cpu_idle_callb_mutex);
	if (idle_cpu == cpu_idle_adaptive) {
		/*
		 * invoke the slave to call synchronous idle function.
		 */
		cp->cpu_m.mcpu_idle_cpu = cpu_idle_stop_sync;
		poke_cpu(cp->cpu_id);

		/*
		 * wait until the slave switchs to non deep idle function,
		 * so that the master is safe to go on to reclaim the resource.
		 */
		while (cp->cpu_m.mcpu_idle_cpu != non_deep_idle_cpu) {
			drv_usecwait(10);
			if ((++i % CPU_IDLE_STOP_TIMEOUT) == 0)
				cmn_err(CE_NOTE, "!cpu_idle_stop: the slave"
				    " idle stop timeout");
		}
	}
	mutex_exit(&cpu_idle_callb_mutex);

	cstate = (cpu_acpi_cstate_t *)CPU_ACPI_CSTATES(handle);
	if (cstate) {
		cpu_max_cstates = cpu_acpi_get_max_cstates(handle);

		for (i = CPU_ACPI_C1; i <= cpu_max_cstates; i++) {
			if (cstate->cs_ksp != NULL)
				kstat_delete(cstate->cs_ksp);
			cstate++;
		}
	}
	cpupm_free_ms_cstate(cp);
	cpupm_remove_domains(cp, CPUPM_C_STATES, &cpupm_cstate_domains);
	cpu_acpi_free_cstate_data(handle);
}

/*ARGSUSED*/
static boolean_t
cpu_deep_idle_callb(void *arg, int code)
{
	boolean_t rslt = B_TRUE;

	mutex_enter(&cpu_idle_callb_mutex);
	switch (code) {
	case PM_DEFAULT_CPU_DEEP_IDLE:
		/*
		 * Default policy is same as enable
		 */
		/*FALLTHROUGH*/
	case PM_ENABLE_CPU_DEEP_IDLE:
		if ((cpu_idle_cfg_state & CPU_IDLE_DEEP_CFG) == 0)
			break;

		if (cstate_timer_callback(PM_ENABLE_CPU_DEEP_IDLE)) {
			disp_enq_thread = cstate_wakeup;
			idle_cpu = cpu_idle_adaptive;
			cpu_idle_cfg_state &= ~CPU_IDLE_DEEP_CFG;
		} else {
			rslt = B_FALSE;
		}
		break;

	case PM_DISABLE_CPU_DEEP_IDLE:
		if (cpu_idle_cfg_state & CPU_IDLE_DEEP_CFG)
			break;

		idle_cpu = non_deep_idle_cpu;
		if (cstate_timer_callback(PM_DISABLE_CPU_DEEP_IDLE)) {
			disp_enq_thread = non_deep_idle_disp_enq_thread;
			cpu_idle_cfg_state |= CPU_IDLE_DEEP_CFG;
		}
		break;

	default:
		cmn_err(CE_NOTE, "!cpu deep_idle_callb: invalid code %d\n",
		    code);
		break;
	}
	mutex_exit(&cpu_idle_callb_mutex);
	return (rslt);
}

/*ARGSUSED*/
static boolean_t
cpu_idle_cpr_callb(void *arg, int code)
{
	boolean_t rslt = B_TRUE;

	mutex_enter(&cpu_idle_callb_mutex);
	switch (code) {
	case CB_CODE_CPR_RESUME:
		if (cstate_timer_callback(CB_CODE_CPR_RESUME)) {
			/*
			 * Do not enable dispatcher hooks if disabled by user.
			 */
			if (cpu_idle_cfg_state & CPU_IDLE_DEEP_CFG)
				break;

			disp_enq_thread = cstate_wakeup;
			idle_cpu = cpu_idle_adaptive;
		} else {
			rslt = B_FALSE;
		}
		break;

	case CB_CODE_CPR_CHKPT:
		idle_cpu = non_deep_idle_cpu;
		disp_enq_thread = non_deep_idle_disp_enq_thread;
		(void) cstate_timer_callback(CB_CODE_CPR_CHKPT);
		break;

	default:
		cmn_err(CE_NOTE, "!cpudvr cpr_callb: invalid code %d\n", code);
		break;
	}
	mutex_exit(&cpu_idle_callb_mutex);
	return (rslt);
}

/*
 * handle _CST notification
 */
void
cpuidle_cstate_instance(cpu_t *cp)
{
#ifndef	__xpv
	cpupm_mach_state_t	*mach_state =
	    (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
	cpu_acpi_handle_t	handle;
	struct machcpu		*mcpu;
	cpuset_t 		dom_cpu_set;
	kmutex_t		*pm_lock;
	int			result = 0;
	processorid_t		cpu_id;

	if (mach_state == NULL) {
		return;
	}

	ASSERT(mach_state->ms_cstate.cma_domain != NULL);
	dom_cpu_set = mach_state->ms_cstate.cma_domain->pm_cpus;
	pm_lock = &mach_state->ms_cstate.cma_domain->pm_lock;

	/*
	 * Do for all the CPU's in the domain
	 */
	mutex_enter(pm_lock);
	do {
		CPUSET_FIND(dom_cpu_set, cpu_id);
		if (cpu_id == CPUSET_NOTINSET)
			break;

		ASSERT(cpu_id >= 0 && cpu_id < NCPU);
		cp = cpu[cpu_id];
		mach_state = (cpupm_mach_state_t *)
		    cp->cpu_m.mcpu_pm_mach_state;
		if (!(mach_state->ms_caps & CPUPM_C_STATES)) {
			mutex_exit(pm_lock);
			return;
		}
		handle = mach_state->ms_acpi_handle;
		ASSERT(handle != NULL);

		/*
		 * re-evaluate cstate object
		 */
		if (cpu_acpi_cache_cstate_data(handle) != 0) {
			cmn_err(CE_WARN, "Cannot re-evaluate the cpu c-state"
			    " object Instance: %d", cpu_id);
		}
		mcpu = &(cp->cpu_m);
		mcpu->max_cstates = cpu_acpi_get_max_cstates(handle);
		if (mcpu->max_cstates > CPU_ACPI_C1) {
			(void) cstate_timer_callback(
			    CST_EVENT_MULTIPLE_CSTATES);
			disp_enq_thread = cstate_wakeup;
			cp->cpu_m.mcpu_idle_cpu = cpu_acpi_idle;
		} else if (mcpu->max_cstates == CPU_ACPI_C1) {
			disp_enq_thread = non_deep_idle_disp_enq_thread;
			cp->cpu_m.mcpu_idle_cpu = non_deep_idle_cpu;
			(void) cstate_timer_callback(CST_EVENT_ONE_CSTATE);
		}

		CPUSET_ATOMIC_XDEL(dom_cpu_set, cpu_id, result);
	} while (result < 0);
	mutex_exit(pm_lock);
#endif
}

/*
 * handle the number or the type of available processor power states change
 */
void
cpuidle_manage_cstates(void *ctx)
{
	cpu_t			*cp = ctx;
	cpupm_mach_state_t	*mach_state =
	    (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
	boolean_t		is_ready;

	if (mach_state == NULL) {
		return;
	}

	/*
	 * We currently refuse to power manage if the CPU is not ready to
	 * take cross calls (cross calls fail silently if CPU is not ready
	 * for it).
	 *
	 * Additionally, for x86 platforms we cannot power manage an instance,
	 * until it has been initialized.
	 */
	is_ready = (cp->cpu_flags & CPU_READY) && cpupm_cstate_ready(cp);
	if (!is_ready)
		return;

	cpuidle_cstate_instance(cp);
}