/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/mutex.h>
#include <sys/cpuvar.h>
#include <sys/cyclic.h>
#include <sys/disp.h>
#include <sys/ddi.h>
#include <sys/wdt.h>
#include <sys/callb.h>
#include <sys/cmn_err.h>
#include <sys/hypervisor_api.h>
#include <sys/membar.h>
#include <sys/x_call.h>
#include <sys/promif.h>
#include <sys/systm.h>
#include <sys/mach_descrip.h>
#include <sys/cpu_module.h>
#include <sys/pg.h>
#include <sys/lgrp.h>
#include <sys/sysmacros.h>
#include <sys/sunddi.h>
#include <sys/cpupart.h>
#include <sys/hsvc.h>
#include <vm/hat_sfmmu.h>

/*
 * Sun4v OS Suspend
 *
 * Provides a means to suspend a sun4v guest domain by pausing CPUs and then
 * calling into the HV to initiate a suspension. Suspension is sequenced
 * externally by calling suspend_pre, suspend_start, and suspend_post.
 * suspend_pre and suspend_post are meant to perform any special operations
 * that should be done before or after a suspend/resume operation, e.g.,
 * callbacks to cluster software to disable heartbeat monitoring before the
 * system is suspended. suspend_start prepares kernel services to be suspended
 * and then suspends the domain by calling hv_guest_suspend.
 *
 * Special Handling for %tick and %stick Registers
 *
 * After a suspend/resume operation, the %tick and %stick registers may have
 * jumped forwards or backwards. The delta is assumed to be consistent across
 * all CPUs, within the negligible level of %tick and %stick variation
 * acceptable on a cold boot. In order to maintain increasing %tick and %stick
 * counter values without exposing large positive or negative jumps to kernel
 * or user code, a %tick and %stick offset is used. Kernel reads of these
 * counters return the sum of the hardware register counter and offset
 * variable. After a suspend/resume operation, user reads of %tick or %stick
 * are emulated. Suspend code enables emulation by setting the
 * %{tick,stick}.NPT fields which trigger a privileged instruction access
 * trap whenever the registers are read from user mode. If emulation has been
 * enabled, the trap handler emulates the instruction. Emulation is only
 * enabled during a successful suspend/resume operation. When emulation is
 * enabled, CPUs that are DR'd into the system will have their
 * %{tick,stick}.NPT bits set to 1 as well.
 */

/*
 * Routines and data that are defined outside of this file.
 */

/* Time counter accessors. */
extern u_longlong_t gettick(void);	/* returns %stick */
extern uint64_t gettick_counter(void);	/* returns %tick */

/*
 * Accessors for the {tick,stick}.NPT state of the current CPU. In this
 * file they are only used in ASSERTs, which expect zero while emulation
 * is inactive and non-zero once the NPT bits have been set.
 */
extern uint64_t gettick_npt(void);
extern uint64_t getstick_npt(void);

/* Called by suspend_start() after a successful resume to refresh the MD. */
extern int mach_descrip_update(void);

/* CPU set handed to xt_sync() by suspend_start(). */
extern cpuset_t cpu_ready_set;

/*
 * Global %tick and %stick offsets. They are recomputed by
 * set_tick_offsets() after every successful suspend/resume.
 */
extern uint64_t native_tick_offset;
extern uint64_t native_stick_offset;

/*
 * Global Sun Cluster pre/post callbacks.
 *
 * All three pointers are NULL unless something outside this file sets
 * them; every use in this file checks for NULL first.
 *
 * cl_suspend_error_decode	translates a non-zero callback return value
 *				into a string; a NULL or empty result is
 *				treated as "no description available".
 * cl_suspend_pre_callback	invoked from suspend_pre(); zero is success.
 * cl_suspend_post_callback	invoked from suspend_post(); zero is success.
 */
const char *(*cl_suspend_error_decode)(int);
int (*cl_suspend_pre_callback)(void);
int (*cl_suspend_post_callback)(void);

/*
 * Fallback messages used when a callback failure cannot be decoded, and
 * the upper bound (terminator included) placed on a decoded message.
 */
#define	SC_PRE_FAIL_STR_FMT	"Sun Cluster pre-suspend failure: %d"
#define	SC_POST_FAIL_STR_FMT	"Sun Cluster post-suspend failure: %d"
#define	SC_FAIL_STR_MAX		256

/*
 * The minimum major and minor version of the HSVC_GROUP_CORE API group
 * required in order to use OS suspend. suspend_supported() accepts this
 * major version with a minor version of at least SUSPEND_CORE_MINOR, or
 * any greater major version.
 */
#define	SUSPEND_CORE_MAJOR	1
#define	SUSPEND_CORE_MINOR	2

/*
 * By default, sun4v OS suspend is supported if the required HV version
 * is present. suspend_disabled should be set on platforms that do not
 * allow OS suspend regardless of whether or not the HV supports it.
 * It can also be set in /etc/system. Any non-zero value makes
 * suspend_supported() return B_FALSE.
 */
static int suspend_disabled = 0;

/*
 * Controls whether or not user-land tick and stick register emulation
 * will be enabled following a successful suspend operation. Consulted by
 * suspend_start() after the guest has been resumed.
 */
static int enable_user_tick_stick_emulation = 1;

/*
 * Indicates whether or not tick and stick emulation is currently active.
 * After a successful suspend operation, if emulation is enabled, this
 * variable is set to B_TRUE. Global scope to allow emulation code to
 * check if emulation is active. Nothing in this file sets it back to
 * B_FALSE.
 */
boolean_t tick_stick_emulation_active = B_FALSE;

/*
 * When non-zero, after a successful suspend and resume, cpunodes, CPU HW
 * sharing data structures, and processor groups will be updated using
 * information from the updated MD (see update_cpu_mappings()).
 */
static int suspend_update_cpu_mappings = 1;

/*
 * DBG() and DBG_PROM() debug output macros.
 *
 * On DEBUG kernels both macros expand to an unbraced
 * "if (suspend_debug_flag)" followed by the name of a printf-like
 * function, so the argument list written after the macro becomes the
 * call. DBG() logs through cmn_err(CE_NOTE) and DBG_PROM() through
 * prom_printf(). Because the expansion is a bare "if", never place a
 * DBG()/DBG_PROM() call directly in front of an "else"; wrap it in
 * braces instead.
 *
 * On non-DEBUG kernels both macros expand to nothing, which leaves the
 * parenthesized argument list behind as an expression statement.
 */
#ifdef	DEBUG

static int suspend_debug_flag = 0;

#define	DBG_PROM		\
if (suspend_debug_flag)		\
	prom_printf

#define	DBG			\
if (suspend_debug_flag)		\
	suspend_debug

/*
 * Format a debug message and log it with cmn_err(CE_NOTE).
 *
 * The message is formatted into a fixed-size stack buffer. The bounded
 * vsnprintf() is used so that an overly long message is truncated
 * rather than written past the end of the buffer.
 */
static void
suspend_debug(const char *fmt, ...)
{
	char	buf[512];
	va_list	ap;

	va_start(ap, fmt);
	(void) vsnprintf(buf, sizeof (buf), fmt, ap);
	va_end(ap);

	cmn_err(CE_NOTE, "%s", buf);
}

#else /* DEBUG */

#define	DBG_PROM
#define	DBG

#endif /* DEBUG */

/*
 * Return true if the HV supports OS suspend and if suspend has not been
 * disabled on this platform.
 */
boolean_t
suspend_supported(void)
{
	uint64_t major, minor;

	if (suspend_disabled)
		return (B_FALSE);

	if (hsvc_version(HSVC_GROUP_CORE, &major, &minor) != 0)
		return (B_FALSE);

	return ((major == SUSPEND_CORE_MAJOR && minor >= SUSPEND_CORE_MINOR) ||
	    (major > SUSPEND_CORE_MAJOR));
}

/*
 * Recompute the global %tick and %stick offsets so that, for each
 * counter, (value currently read + offset) equals the value captured
 * on the source side before the suspend.
 *
 * source_tick		%tick value recorded before suspending
 * source_stick		%stick value recorded before suspending
 */
static void
set_tick_offsets(uint64_t source_tick, uint64_t source_stick)
{
	uint64_t	cur_tick;
	uint64_t	cur_stick;

	/*
	 * Clear both offsets before sampling so that the values read
	 * below do not include the offsets that were in effect before.
	 */
	native_tick_offset = 0;
	native_stick_offset = 0;

	cur_tick = gettick_counter();		/* %tick */
	cur_stick = gettick();			/* %stick */

	/* Unsigned arithmetic: a backwards jump simply wraps. */
	native_tick_offset = source_tick - cur_tick;
	native_stick_offset = source_stick - cur_stick;
}

/*
 * Set the {tick,stick}.NPT field to 1 on this CPU.
 *
 * Takes no arguments and returns nothing; suspend_start() hands it to
 * xc_all() (cast to xcfunc_t *). The return values of both hypervisor
 * calls are deliberately discarded.
 */
static void
enable_tick_stick_npt(void)
{
	(void) hv_stick_set_npt(1);
	(void) hv_tick_set_npt(1);
}

/*
 * Bring the calling CPU's {tick,stick}.NPT fields in line with the
 * system-wide emulation state. This is used when a CPU is DR'd into
 * the system.
 *
 * If user %tick/%stick emulation is active, both NPT fields are set on
 * this CPU. Otherwise nothing is changed and, on DEBUG kernels, both
 * fields are asserted to be clear.
 */
void
suspend_sync_tick_stick_npt(void)
{
	if (!tick_stick_emulation_active) {
		ASSERT(gettick_npt() == 0);
		ASSERT(getstick_npt() == 0);
		return;
	}

	DBG("enabling {%%tick/%%stick}.NPT on CPU 0x%x", CPU->cpu_id);
	enable_tick_stick_npt();
}

/*
 * Update cpunodes, CPU HW sharing data structures, and processor groups
 * (PGs) from the machine description (MD).
 *
 * A handle on the MD is taken with md_get_handle() and released with
 * md_fini_handle() once the update is complete. If no handle can be
 * obtained the function returns without changing anything. cpu_lock is
 * acquired here and held for the whole update, and CPUs are paused and
 * restarted twice while the per-CPU PG structures are swapped.
 */
static void
update_cpu_mappings(void)
{
	md_t		*mdp;
	processorid_t	id;
	cpu_t		*cp;
	/*
	 * Per-CPU PG structure pointers, indexed by CPU id. Only the slots
	 * of CPUs for which cpu_get() returns non-NULL are written or read.
	 * NOTE(review): this is an NCPU-element pointer array on the stack.
	 */
	cpu_pg_t	*pgps[NCPU];

	if ((mdp = md_get_handle()) == NULL) {
		DBG("suspend: md_get_handle failed");
		return;
	}

	DBG("suspend: updating CPU mappings");

	mutex_enter(&cpu_lock);

	/* Rebuild chip and execution unit mappings, then remap each CPU. */
	setup_chip_mappings(mdp);
	setup_exec_unit_mappings(mdp);
	for (id = 0; id < NCPU; id++) {
		if ((cp = cpu_get(id)) == NULL)
			continue;
		cpu_map_exec_units(cp);
	}

	/*
	 * Re-calculate processor groups.
	 *
	 * First tear down all PG information before adding any new PG
	 * information derived from the MD we just downloaded. We must
	 * call pg_cpu_inactive and pg_cpu_active with CPUs paused and
	 * we want to minimize the number of times pause_cpus is called.
	 * Inactivating all CPUs would leave PGs without any active CPUs,
	 * so while CPUs are paused, call pg_cpu_inactive and swap in the
	 * bootstrap PG structure saving the original PG structure to be
	 * fini'd afterwards. This prevents the dispatcher from encountering
	 * PGs in which all CPUs are inactive.
	 */
	pause_cpus(NULL);
	for (id = 0; id < NCPU; id++) {
		if ((cp = cpu_get(id)) == NULL)
			continue;
		pg_cpu_inactive(cp);
		/* Remember the old PG structure; it is fini'd below. */
		pgps[id] = cp->cpu_pg;
		pg_cpu_bootstrap(cp);
	}
	start_cpus();

	/*
	 * pg_cpu_fini* and pg_cpu_init* must be called while CPUs are
	 * not paused. Use two separate loops here so that we do not
	 * initialize PG data for CPUs until all the old PG data structures
	 * are torn down.
	 */
	for (id = 0; id < NCPU; id++) {
		if ((cp = cpu_get(id)) == NULL)
			continue;
		pg_cpu_fini(cp, pgps[id]);
	}

	/*
	 * Initialize PG data for each CPU, but leave the bootstrapped
	 * PG structure in place to avoid running with any PGs containing
	 * nothing but inactive CPUs. The pgps[] slots are reused to hold
	 * the newly initialized PG structures.
	 */
	for (id = 0; id < NCPU; id++) {
		if ((cp = cpu_get(id)) == NULL)
			continue;
		pgps[id] = pg_cpu_init(cp, B_TRUE);
	}

	/*
	 * Now that PG data has been initialized for all CPUs in the
	 * system, replace the bootstrapped PG structure with the
	 * initialized PG structure and call pg_cpu_active for each CPU.
	 */
	pause_cpus(NULL);
	for (id = 0; id < NCPU; id++) {
		if ((cp = cpu_get(id)) == NULL)
			continue;
		cp->cpu_pg = pgps[id];
		pg_cpu_active(cp);
	}
	start_cpus();

	mutex_exit(&cpu_lock);

	/* Release the MD handle taken at the top of the function. */
	(void) md_fini_handle(mdp);
}

/*
 * Wrapper for the Sun Cluster error decoding function.
 *
 * Asks cl_suspend_error_decode to describe 'error' and copies the
 * description into error_reason as a NUL-terminated string of at most
 * MIN(max_reason_len, SC_FAIL_STR_MAX) bytes, terminator included.
 *
 * error		error code returned by a cluster callback
 * error_reason		destination buffer; must not be NULL
 * max_reason_len	size of error_reason in bytes; must be non-zero
 *
 * Returns 0 if a description was stored in error_reason. Returns -1,
 * leaving error_reason untouched, if the arguments are invalid, no
 * decode function is registered, or the decode function returned NULL
 * or an empty string.
 */
static int
cluster_error_decode(int error, char *error_reason, size_t max_reason_len)
{
	const char	*decoded;
	size_t		decoded_len;

	ASSERT(error_reason != NULL);
	ASSERT(max_reason_len > 0);

	/*
	 * The ASSERTs above only fire on DEBUG kernels. Reject bad
	 * arguments at run time as well: with a zero max_reason_len the
	 * "max_reason_len - 1" below would wrap around and leave the
	 * length check and the copy effectively unbounded.
	 */
	if (error_reason == NULL || max_reason_len == 0)
		return (-1);

	max_reason_len = MIN(max_reason_len, SC_FAIL_STR_MAX);

	if (cl_suspend_error_decode == NULL)
		return (-1);

	if ((decoded = (*cl_suspend_error_decode)(error)) == NULL)
		return (-1);

	/* Number of non-NUL bytes to copy, leaving room for a terminator */
	if ((decoded_len = strnlen(decoded, max_reason_len - 1)) == 0)
		return (-1);

	bcopy(decoded, error_reason, decoded_len);

	/*
	 * The error string returned from cl_suspend_error_decode
	 * should be NUL-terminated, but set the terminator here
	 * because we only copied non-NUL bytes. If the decoded
	 * string was truncated or was not NUL-terminated, this
	 * guarantees that error_reason will be.
	 */
	error_reason[decoded_len] = '\0';

	return (0);
}

/*
 * Invoke the Sun Cluster pre-suspend callback, if one is registered.
 *
 * Returns 0 when no callback is registered, otherwise the callback's
 * return value. When the callback fails and the caller supplied a
 * usable buffer (error_reason non-NULL, max_reason_len non-zero), the
 * buffer receives the decoded failure reason or, if the error cannot
 * be decoded, a generic message containing the error number.
 */
static int
cluster_pre_wrapper(char *error_reason, size_t max_reason_len)
{
	int	err;

	if (cl_suspend_pre_callback == NULL)
		return (0);

	err = (*cl_suspend_pre_callback)();
	DBG("suspend: cl_suspend_pre_callback returned %d", err);

	/* Nothing to report, or nowhere to report it. */
	if (err == 0 || error_reason == NULL || max_reason_len == 0)
		return (err);

	if (cluster_error_decode(err, error_reason, max_reason_len) != 0) {
		(void) snprintf(error_reason, max_reason_len,
		    SC_PRE_FAIL_STR_FMT, err);
	}

	return (err);
}

/*
 * Invoke the Sun Cluster post-suspend callback, if one is registered.
 *
 * Returns 0 when no callback is registered, otherwise the callback's
 * return value. When the callback fails and the caller supplied a
 * usable buffer (error_reason non-NULL, max_reason_len non-zero), the
 * buffer receives the decoded failure reason or, if the error cannot
 * be decoded, a generic message containing the error number.
 */
static int
cluster_post_wrapper(char *error_reason, size_t max_reason_len)
{
	int	err;

	if (cl_suspend_post_callback == NULL)
		return (0);

	err = (*cl_suspend_post_callback)();
	DBG("suspend: cl_suspend_post_callback returned %d", err);

	/* Nothing to report, or nowhere to report it. */
	if (err == 0 || error_reason == NULL || max_reason_len == 0)
		return (err);

	if (cluster_error_decode(err, error_reason, max_reason_len) != 0) {
		(void) snprintf(error_reason, max_reason_len,
		    SC_POST_FAIL_STR_FMT, err);
	}

	return (err);
}

/*
 * Execute pre-suspend callbacks preparing the system for a suspend operation.
 * Must be called successfully before suspend_start can be called. Callers
 * should first call suspend_supported to determine if OS suspend is
 * supported.
 *
 * error_reason		optional buffer that receives a description of a
 *			callback failure; may be NULL
 * max_reason_len	size of error_reason in bytes
 * recovered		indicates whether the callbacks could be undone in
 *			the event of a failure; must not be NULL
 *
 * Returns zero on success, ENOTSUP if OS suspend is not supported, or the
 * non-zero value returned by the failing pre-suspend callback.
 *
 * On every failure path *recovered is set to B_TRUE, because at present
 * there is nothing that would need to be undone. On success *recovered is
 * left unmodified.
 */
int
suspend_pre(char *error_reason, size_t max_reason_len, boolean_t *recovered)
{
	int rv;

	ASSERT(recovered != NULL);

	/*
	 * Return an error if suspend_pre is erroneously called
	 * when OS suspend is not supported.
	 */
	ASSERT(suspend_supported());
	if (!suspend_supported()) {
		DBG("suspend: suspend_pre called without suspend support");
		/*
		 * The ASSERT above only fires on DEBUG kernels, so guard
		 * the store here exactly as the failure path below does.
		 */
		if (recovered != NULL)
			*recovered = B_TRUE;
		return (ENOTSUP);
	}
	DBG("suspend: %s", __func__);

	rv = cluster_pre_wrapper(error_reason, max_reason_len);

	/*
	 * At present, only one pre-suspend operation exists.
	 * If it fails, no recovery needs to be done.
	 */
	if (rv != 0 && recovered != NULL)
		*recovered = B_TRUE;

	return (rv);
}

/*
 * Run the post-suspend callbacks. Must be called after suspend_start is
 * called, regardless of whether or not suspend_start is successful.
 *
 * error_reason and max_reason_len describe an optional buffer that
 * receives a description of a callback failure.
 *
 * Returns zero on success, non-zero on failure.
 */
int
suspend_post(char *error_reason, size_t max_reason_len)
{
	int	rv;

	ASSERT(suspend_supported());
	DBG("suspend: %s", __func__);

	rv = cluster_post_wrapper(error_reason, max_reason_len);

	return (rv);
}

/*
 * Suspends the OS by pausing CPUs and calling into the HV to initiate
 * the suspend. When the HV routine hv_guest_suspend returns, the system
 * will be resumed. Must be called after a successful call to suspend_pre.
 * suspend_post must be called after suspend_start, whether or not
 * suspend_start returns an error.
 *
 * error_reason and max_reason_len are accepted for interface symmetry
 * with suspend_pre/suspend_post but are not used by this function.
 *
 * Returns zero after a successful suspend/resume cycle. If
 * hv_guest_suspend fails, the steps taken so far are undone and its
 * non-zero return value is returned. A failure to update the machine
 * description after a successful resume panics the system.
 */
/*ARGSUSED*/
int
suspend_start(char *error_reason, size_t max_reason_len)
{
	uint64_t	source_tick;
	uint64_t	source_stick;
	uint64_t	rv;
	timestruc_t	source_tod;
	int		spl;

	ASSERT(suspend_supported());
	DBG("suspend: %s", __func__);

	/*
	 * Lock order: the ctxdoms lock is taken before cpu_lock. cpu_lock
	 * is dropped before the MD is updated; the ctxdoms lock is held
	 * until after sfmmu_ctxdoms_update() below.
	 */
	sfmmu_ctxdoms_lock();

	mutex_enter(&cpu_lock);

	/* Suspend the watchdog */
	watchdog_suspend();

	/* Record the TOD so it can be restored after the resume */
	mutex_enter(&tod_lock);
	source_tod = tod_get();
	mutex_exit(&tod_lock);

	/* Pause all other CPUs */
	pause_cpus(NULL);
	DBG_PROM("suspend: CPUs paused\n");

	/* Suspend cyclics and disable interrupts */
	cyclic_suspend();
	DBG_PROM("suspend: cyclics suspended\n");
	spl = spl8();

	/*
	 * Capture the pre-suspend counter values; they are the reference
	 * from which the new offsets are computed after the resume.
	 */
	source_tick = gettick_counter();
	source_stick = gettick();
	DBG_PROM("suspend: source_tick: 0x%lx\n", source_tick);
	DBG_PROM("suspend: source_stick: 0x%lx\n", source_stick);

	/*
	 * Call into the HV to initiate the suspend.
	 * hv_guest_suspend() returns after the guest has been
	 * resumed or if the suspend operation failed or was
	 * cancelled. After a successful suspend, the %tick and
	 * %stick registers may have changed by an amount that is
	 * not proportional to the amount of time that has passed.
	 * They may have jumped forwards or backwards. This jump
	 * must be uniform across all CPUs and we operate under
	 * the assumption that it is (maintaining two global offset
	 * variables--one for %tick and one for %stick.)
	 */
	DBG_PROM("suspend: suspending... \n");
	rv = hv_guest_suspend();
	if (rv != 0) {
		/*
		 * Suspend failed or was cancelled: undo the preparation
		 * steps in reverse order. The tick/stick offsets and the
		 * TOD are left untouched on this path.
		 */
		splx(spl);
		cyclic_resume();
		start_cpus();
		watchdog_resume();
		mutex_exit(&cpu_lock);
		sfmmu_ctxdoms_unlock();
		DBG("suspend: failed, rv: %ld\n", rv);
		/* NOTE(review): rv is a uint64_t narrowed to int here. */
		return (rv);
	}

	/* Update the global tick and stick offsets */
	set_tick_offsets(source_tick, source_stick);

	/* Ensure new offsets are globally visible before resuming CPUs */
	membar_sync();

	/* Enable interrupts */
	splx(spl);

	/*
	 * Set the {%tick,%stick}.NPT bits on all CPUs. This is done on
	 * every successful resume while emulation is enabled, not only
	 * the first one. The other CPUs have not been restarted yet.
	 */
	if (enable_user_tick_stick_emulation) {
		xc_all((xcfunc_t *)enable_tick_stick_npt, NULL, NULL);
		xt_sync(cpu_ready_set);
		ASSERT(gettick_npt() != 0);
		ASSERT(getstick_npt() != 0);
	}

	/* If emulation is enabled, but not currently active, enable it */
	if (enable_user_tick_stick_emulation && !tick_stick_emulation_active) {
		tick_stick_emulation_active = B_TRUE;
	}

	sfmmu_ctxdoms_remove();

	/* Resume cyclics, unpause CPUs */
	cyclic_resume();
	start_cpus();

	/* Set the TOD to the value recorded before the suspend */
	mutex_enter(&tod_lock);
	tod_set(source_tod);
	mutex_exit(&tod_lock);

	/* Re-enable the watchdog */
	watchdog_resume();

	mutex_exit(&cpu_lock);

	/* Download the latest MD; failure here is fatal */
	if ((rv = mach_descrip_update()) != 0)
		cmn_err(CE_PANIC, "suspend: mach_descrip_update failed: %ld",
		    rv);

	sfmmu_ctxdoms_update();
	sfmmu_ctxdoms_unlock();

	/*
	 * Get new MD, update CPU mappings/relationships. cpu_lock was
	 * released above; update_cpu_mappings() acquires it itself.
	 */
	if (suspend_update_cpu_mappings)
		update_cpu_mappings();

	DBG("suspend: target tick: 0x%lx", gettick_counter());
	DBG("suspend: target stick: 0x%llx", gettick());
	DBG("suspend: user %%tick/%%stick emulation is %d",
	    tick_stick_emulation_active);
	DBG("suspend: finished");

	return (0);
}