/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/types.h>
#include <string.h>
#include <alloca.h>
#include <stdlib.h>
#include <stdio.h>
#include <libintl.h>

#include "libcpc.h"
#include "libcpc_impl.h"

/*
 * Configuration data for Pentium and Pentium Pro performance counters.
 *
 * Definitions taken from [3].  See the reference to
 * understand what any of these settings actually means.
 *
 * [3] "Pentium Pro Family Developer's Manual, Volume 3:
 *     Operating Systems Writer's Manual," January 1996
 */

/*
 * Flag bits describing which cpu variants implement a given event.
 * Each nametable entry carries one of these in its 'ver' member, and
 * cpuvermap[] holds the set of flags that each cpu version accepts.
 * V_END (zero) marks the last entry of a nametable.
 */
#define	V_P5	(1u << 0)		/* Pentium events */
#define	V_P5mmx	(1u << 1)		/* Pentium with MMX events */
#define	V_P6	(1u << 2)		/* Pentium Pro / Pentium II events */
#define	V_P6mmx	(1u << 3)		/* Pentium Pro / II with MMX events */
#define	V_END	0			/* nametable terminator */

/*
 * map from "cpu version" to flag bits
 *
 * Indexed by (cpuver - CPC_PENTIUM); see MAPCPUVER().  The MMX variants
 * carry the base flag as well, so they accept both the base events and
 * the MMX-only events of their family.
 */
static const uint_t cpuvermap[] = {
	V_P5,		/* CPC_PENTIUM */
	V_P5 | V_P5mmx,	/* CPC_PENTIUM_MMX */
	V_P6,		/* CPC_PENTIUM_PRO */
	V_P6 | V_P6mmx,	/* CPC_PENTIUM_PRO_MMX */
};

/*
 * One entry of an event name table.  A table is an array of these,
 * terminated by an entry whose 'ver' is V_END.
 */
struct nametable {
	const uint_t	ver;	/* V_* flag: cpu variants with this event */
	const uint8_t	bits;	/* event code associated with the name */
	const char	*name;	/* event name string */
};

/*
 * Basic Pentium events
 *
 * Expands to a comma-separated list of nametable initializers (with no
 * trailing comma), each tagged with the version flag 'v'.  It is used
 * as the common leading part of both per-register Pentium tables below.
 */
#define	P5_EVENTS(v)						\
	{v,		0x0,	"data_read"},			\
	{v,		0x1,	"data_write"},			\
	{v,		0x2,	"data_tlb_miss"},		\
	{v,		0x3,	"data_read_miss"},		\
	{v,		0x4,	"data_write_miss"},		\
	{v,		0x5,	"write_hit_to_M_or_E"},		\
	{v,		0x6,	"dcache_lines_wrback"},		\
	{v,		0x7,	"external_snoops"},		\
	{v,		0x8,	"external_dcache_snoop_hits"},	\
	{v,		0x9,	"memory_access_in_both_pipes"},	\
	{v,		0xa,	"bank_conflicts"},		\
	{v,		0xb,	"misaligned_ref"},		\
	{v,		0xc,	"code_read"},			\
	{v,		0xd,	"code_tlb_miss"},		\
	{v,		0xe,	"code_cache_miss"},		\
	{v,		0xf,	"any_segreg_loaded"},		\
	{v,		0x12,	"branches"},			\
	{v,		0x13,	"btb_hits"},			\
	{v,		0x14,	"taken_or_btb_hit"},		\
	{v,		0x15,	"pipeline_flushes"},		\
	{v,		0x16,	"instr_exec"},			\
	{v,		0x17,	"instr_exec_V_pipe"},		\
	{v,		0x18,	"clks_bus_cycle"},		\
	{v,		0x19,	"clks_full_wbufs"},		\
	{v,		0x1a,	"pipe_stall_read"},		\
	{v,		0x1b,	"stall_on_write_ME"},		\
	{v,		0x1c,	"locked_bus_cycle"},		\
	{v,		0x1d,	"io_rw_cycles"},		\
	{v,		0x1e,	"reads_noncache_mem"},		\
	{v,		0x1f,	"pipeline_agi_stalls"},		\
	{v,		0x22,	"flops"},			\
	{v,		0x23,	"bp_match_dr0"},		\
	{v,		0x24,	"bp_match_dr1"},		\
	{v,		0x25,	"bp_match_dr2"},		\
	{v,		0x26,	"bp_match_dr3"},		\
	{v,		0x27,	"hw_intrs"},			\
	{v,		0x28,	"data_rw"},			\
	{v,		0x29,	"data_rw_miss"}

/*
 * Pentium event names for counter register 0 (selected by getnametable()
 * through P5mmx_names[0]).  The basic events are tagged V_P5; the
 * entries tagged V_P5mmx only match cpu versions whose cpuvermap[]
 * entry includes V_P5mmx.
 */
static const struct nametable P5mmx_names0[] = {
	P5_EVENTS(V_P5),
	{V_P5mmx,	0x2a,	"bus_ownership_latency"},
	{V_P5mmx,	0x2b,	"mmx_instr_upipe"},
	{V_P5mmx,	0x2c,	"cache_M_line_sharing"},
	{V_P5mmx,	0x2d,	"emms_instr"},
	{V_P5mmx,	0x2e,	"bus_util_processor"},
	{V_P5mmx,	0x2f,	"sat_mmx_instr"},
	{V_P5mmx,	0x30,	"clks_not_HLT"},
	{V_P5mmx,	0x31,	"mmx_data_read"},
	{V_P5mmx,	0x32,	"clks_fp_stall"},
	{V_P5mmx,	0x33,	"d1_starv_fifo_0"},
	{V_P5mmx,	0x34,	"mmx_data_write"},
	{V_P5mmx,	0x35,	"pipe_flush_wbp"},
	{V_P5mmx,	0x36,	"mmx_misalign_data_refs"},
	{V_P5mmx,	0x37,	"rets_pred_incorrect"},
	{V_P5mmx,	0x38,	"mmx_multiply_unit_interlock"},
	{V_P5mmx,	0x39,	"rets"},
	{V_P5mmx,	0x3a,	"btb_false_entries"},
	{V_P5mmx,	0x3b,	"clocks_stall_full_wb"},
	{V_END}		/* terminator */
};

/*
 * Pentium event names for counter register 1 (selected by getnametable()
 * through P5mmx_names[1]).  The basic events are the same as for
 * register 0; the V_P5mmx entries reuse the same event codes
 * (0x2a - 0x3b) as the register 0 table but under different names.
 */
static const struct nametable P5mmx_names1[] = {
	P5_EVENTS(V_P5),
	{V_P5mmx,	0x2a,	"bus_ownership_transfers"},
	{V_P5mmx,	0x2b,	"mmx_instr_vpipe"},
	{V_P5mmx,	0x2c,	"cache_lint_sharing"},
	{V_P5mmx,	0x2d,	"mmx_fp_transitions"},
	{V_P5mmx,	0x2e,	"writes_noncache_mem"},
	{V_P5mmx,	0x2f,	"sats_performed"},
	{V_P5mmx,	0x30,	"clks_dcache_tlb_miss"},
	{V_P5mmx,	0x31,	"mmx_data_read_miss"},
	{V_P5mmx,	0x32,	"taken_br"},
	{V_P5mmx,	0x33,	"d1_starv_fifo_1"},
	{V_P5mmx,	0x34,	"mmx_data_write_miss"},
	{V_P5mmx,	0x35,	"pipe_flush_wbp_wb"},
	{V_P5mmx,	0x36,	"mmx_pipe_stall_data_read"},
	{V_P5mmx,	0x37,	"rets_pred"},
	{V_P5mmx,	0x38,	"movd_movq_stall"},
	{V_P5mmx,	0x39,	"rsb_overflow"},
	{V_P5mmx,	0x3a,	"btb_mispred_nt"},
	{V_P5mmx,	0x3b,	"mmx_stall_write_ME"},
	{V_END}		/* terminator */
};

/*
 * Per-register Pentium name tables, indexed by counter register
 * number (0 or 1).
 */
static const struct nametable *P5mmx_names[2] = {
	P5mmx_names0,
	P5mmx_names1
};

/*
 * Pentium Pro and Pentium II events
 *
 * A single table serves both counter registers.  The few events that
 * can only be counted by one particular register are marked below;
 * that restriction is enforced in versionmatch(), keyed on the event
 * code.  Entries tagged V_P6mmx only match cpu versions whose
 * cpuvermap[] entry includes V_P6mmx.
 */
static const struct nametable P6_names[] = {
	/*
	 * Data cache unit
	 */
	{V_P6,		0x43,	"data_mem_refs"},
	{V_P6,		0x45,	"dcu_lines_in"},
	{V_P6,		0x46,	"dcu_m_lines_in"},
	{V_P6,		0x47,	"dcu_m_lines_out"},
	{V_P6,		0x48,	"dcu_miss_outstanding"},

	/*
	 * Instruction fetch unit
	 */
	{V_P6,		0x80,	"ifu_ifetch"},
	{V_P6,		0x81,	"ifu_ifetch_miss"},
	{V_P6,		0x85,	"itlb_miss"},
	{V_P6,		0x86,	"ifu_mem_stall"},
	{V_P6,		0x87,	"ild_stall"},

	/*
	 * L2 cache
	 */
	{V_P6,		0x28,	"l2_ifetch"},
	{V_P6,		0x29,	"l2_ld"},
	{V_P6,		0x2a,	"l2_st"},
	{V_P6,		0x24,	"l2_lines_in"},
	{V_P6,		0x26,	"l2_lines_out"},
	{V_P6,		0x25,	"l2_m_lines_inm"},
	{V_P6,		0x27,	"l2_m_lines_outm"},
	{V_P6,		0x2e,	"l2_rqsts"},
	{V_P6,		0x21,	"l2_ads"},
	{V_P6,		0x22,	"l2_dbus_busy"},
	{V_P6,		0x23,	"l2_dbus_busy_rd"},

	/*
	 * External bus logic
	 */
	{V_P6,		0x62,	"bus_drdy_clocks"},
	{V_P6,		0x63,	"bus_lock_clocks"},
	{V_P6,		0x60,	"bus_req_outstanding"},
	{V_P6,		0x65,	"bus_tran_brd"},
	{V_P6,		0x66,	"bus_tran_rfo"},
	{V_P6,		0x67,	"bus_trans_wb"},
	{V_P6,		0x68,	"bus_tran_ifetch"},
	{V_P6,		0x69,	"bus_tran_inval"},
	{V_P6,		0x6a,	"bus_tran_pwr"},
	{V_P6,		0x6b,	"bus_trans_p"},
	{V_P6,		0x6c,	"bus_trans_io"},
	{V_P6,		0x6d,	"bus_tran_def"},
	{V_P6,		0x6e,	"bus_tran_burst"},
	{V_P6,		0x70,	"bus_tran_any"},
	{V_P6,		0x6f,	"bus_tran_mem"},
	{V_P6,		0x64,	"bus_data_rcv"},
	{V_P6,		0x61,	"bus_bnr_drv"},
	{V_P6,		0x7a,	"bus_hit_drv"},
	{V_P6,		0x7b,	"bus_hitm_drv"},
	{V_P6,		0x7e,	"bus_snoop_stall"},

	/*
	 * Floating point unit
	 * (each of these is restricted to a single counter register)
	 */
	{V_P6,		0xc1,	"flops"},		/* reg 0 only */
	{V_P6,		0x10,	"fp_comp_ops_exe"},	/* reg 0 only */
	{V_P6,		0x11,	"fp_assist"},		/* reg 1 only */
	{V_P6,		0x12,	"mul"},			/* reg 1 only */
	{V_P6,		0x13,	"div"},			/* reg 1 only */
	{V_P6,		0x14,	"cycles_div_busy"},	/* reg 0 only */

	/*
	 * Memory ordering
	 */
	{V_P6,		0x3,	"ld_blocks"},
	{V_P6,		0x4,	"sb_drains"},
	{V_P6,		0x5,	"misalign_mem_ref"},

	/*
	 * Instruction decoding and retirement
	 */
	{V_P6,		0xc0,	"inst_retired"},
	{V_P6,		0xc2,	"uops_retired"},
	{V_P6,		0xd0,	"inst_decoder"},

	/*
	 * Interrupts
	 */
	{V_P6,		0xc8,	"hw_int_rx"},
	{V_P6,		0xc6,	"cycles_int_masked"},
	{V_P6,		0xc7,	"cycles_int_pending_and_masked"},

	/*
	 * Branches
	 */
	{V_P6,		0xc4,	"br_inst_retired"},
	{V_P6,		0xc5,	"br_miss_pred_retired"},
	{V_P6,		0xc9,	"br_taken_retired"},
	{V_P6,		0xca,	"br_miss_pred_taken_ret"},
	{V_P6,		0xe0,	"br_inst_decoded"},
	{V_P6,		0xe2,	"btb_misses"},
	{V_P6,		0xe4,	"br_bogus"},
	{V_P6,		0xe6,	"baclears"},

	/*
	 * Stalls
	 */
	{V_P6,		0xa2,	"resource_stalls"},
	{V_P6,		0xd2,	"partial_rat_stalls"},

	/*
	 * Segment register loads
	 */
	{V_P6,		0x6,	"segment_reg_loads"},

	/*
	 * Clocks
	 */
	{V_P6,		0x79,	"cpu_clk_unhalted"},

	/*
	 * MMX
	 */
	{V_P6mmx,	0xb0,	"mmx_instr_exec"},
	{V_P6mmx,	0xb1,	"mmx_sat_instr_exec"},
	{V_P6mmx,	0xb2,	"mmx_uops_exec"},
	{V_P6mmx,	0xb3,	"mmx_instr_type_exec"},
	{V_P6mmx,	0xcc,	"fp_mmx_trans"},
	{V_P6mmx,	0xcd,	"mmx_assists"},
	{V_P6mmx,	0xce,	"mmx_instr_ret"},
	{V_P6mmx,	0xd4,	"seg_rename_stalls"},
	{V_P6mmx,	0xd5,	"seg_reg_renames"},
	{V_P6mmx,	0xd6,	"ret_seg_renames"},

	{V_END}		/* terminator */
};

/*
 * Translate a cpu version into its set of V_* flag bits.
 * No range check is done here: callers must first confirm, with
 * validargs(), that cpuver indexes a valid cpuvermap[] entry.
 */
#define	MAPCPUVER(cpuver)	(cpuvermap[(cpuver) - CPC_PENTIUM])

/*
 * Sanity-check a (cpu version, counter register) pair.
 *
 * Returns 1 when regno is 0 or 1 and cpuver selects an existing
 * cpuvermap[] entry, and 0 otherwise.
 */
static int
validargs(int cpuver, int regno)
{
	int idx;

	if (!(regno == 0 || regno == 1))
		return (0);

	/* cpuvermap[] is indexed relative to the first known version */
	idx = cpuver - CPC_PENTIUM;
	if (idx >= 0 && idx < sizeof (cpuvermap) / sizeof (cpuvermap[0]))
		return (1);

	return (0);
}

/*ARGSUSED*/
static int
versionmatch(int cpuver, int regno, const struct nametable *n)
{
	if (!validargs(cpuver, regno) || (n->ver & MAPCPUVER(cpuver)) == 0)
		return (0);

	switch (MAPCPUVER(cpuver)) {
	case V_P5:
	case V_P5 | V_P5mmx:
		break;
	case V_P6:
	case V_P6 | V_P6mmx:
		switch (n->bits) {
		case 0xc1:	/* flops */
		case 0x10:	/* fp_comp_ops_exe */
		case 0x14:	/* cycles_div_busy */
			/* only reg0 counts these */
			if (regno == 1)
				return (0);
			break;
		case 0x11:	/* fp_assist */
		case 0x12:	/* mul */
		case 0x13:	/* div */
			/* only 1 can count these */
			if (regno == 0)
				return (0);
			break;
		default:
			break;
		}
		break;
	default:
		return (0);
	}

	return (1);
}

static const struct nametable *
getnametable(int cpuver, int regno)
{
	const struct nametable *n;

	if (!validargs(cpuver, regno))
		return (NULL);

	switch (MAPCPUVER(cpuver)) {
	case V_P5:
	case V_P5 | V_P5mmx:
		n = P5mmx_names[regno];
		break;
	case V_P6:
	case V_P6 | V_P6mmx:
		n = P6_names;
		break;
	default:
		n = NULL;
		break;
	}

	return (n);
}

/*
 * Invoke 'action' once for every event that counter register 'regno'
 * can count on cpu version 'cpuver'.  The callback receives 'arg', the
 * register number, the event name and the event code.  Nothing is
 * called when the arguments do not select a name table.
 */
void
cpc_walk_names(int cpuver, int regno, void *arg,
    void (*action)(void *, int, const char *, uint8_t))
{
	const struct nametable *ent = getnametable(cpuver, regno);

	if (ent == NULL)
		return;

	while (ent->ver != V_END) {
		if (versionmatch(cpuver, regno, ent))
			action(arg, regno, ent->name, ent->bits);
		ent++;
	}
}

/*
 * Map an event code back to its name for the given cpu version and
 * counter register.
 *
 * Returns the name string of the first matching table entry, or NULL
 * if the arguments are invalid or no applicable entry has that code.
 */
const char *
__cpc_reg_to_name(int cpuver, int regno, uint8_t bits)
{
	const struct nametable *ent = getnametable(cpuver, regno);

	if (ent == NULL)
		return (NULL);

	while (ent->ver != V_END) {
		if (ent->bits == bits && versionmatch(cpuver, regno, ent))
			return (ent->name);
		ent++;
	}

	return (NULL);
}

/*
 * Register names can be specified as strings or even as numbers
 *
 * Translate an event specification into an event code for counter
 * register 'regno' on cpu version 'cpuver'.
 *
 * 'name' is first looked up in the applicable name table.  If no entry
 * matches, it is parsed as a number (strtol() base 0, so decimal, octal
 * with a leading 0, or hex with a leading 0x); any value in the range
 * 0 .. UINT8_MAX is accepted and is not checked against the table.
 * The whole string must be consumed by the conversion, so input with
 * trailing junk such as "12abc" is rejected rather than silently
 * treated as 12.
 *
 * Returns 0 and stores the code through 'bits' on success; returns -1
 * if any argument is invalid or the name cannot be translated.
 */
int
__cpc_name_to_reg(int cpuver, int regno, const char *name, uint8_t *bits)
{
	const struct nametable *n;
	char *eptr = NULL;
	long value;

	if (name == NULL || bits == NULL ||
	    (n = getnametable(cpuver, regno)) == NULL)
		return (-1);

	for (; n->ver != V_END; n++) {
		if (strcmp(name, n->name) == 0 &&
		    versionmatch(cpuver, regno, n)) {
			*bits = n->bits;
			return (0);
		}
	}

	/*
	 * Not a known name; try it as a raw event code.  eptr == name
	 * means no digits were converted; *eptr != '\0' means something
	 * other than a number followed them.
	 */
	value = strtol(name, &eptr, 0);
	if (eptr != name && *eptr == '\0' &&
	    value >= 0 && value <= UINT8_MAX) {
		*bits = (uint8_t)value;
		return (0);
	}

	return (-1);
}

/*
 * Return a printable name for the cpu family identified by 'cpuver',
 * or NULL if the version is not one this file knows about.
 */
const char *
cpc_getcciname(int cpuver)
{
	const char *cciname = NULL;

	if (!validargs(cpuver, 0))
		return (NULL);

	switch (MAPCPUVER(cpuver)) {
	case V_P5:
		cciname = "Pentium";
		break;
	case V_P5 | V_P5mmx:
		cciname = "Pentium with MMX";
		break;
	case V_P6:
		cciname = "Pentium Pro, Pentium II";
		break;
	case V_P6 | V_P6mmx:
		cciname = "Pentium Pro with MMX, Pentium II";
		break;
	default:
		break;
	}

	return (cciname);
}

const char *
cpc_getcpuref(int cpuver)
{
	if (validargs(cpuver, 0))
		switch (MAPCPUVER(cpuver)) {
		case V_P5:
		case V_P5 | V_P5mmx:
			return (gettext(
			    "See Appendix A.2 of the \"Intel Architecture "
			    "Software Developer's Manual,\" 243192, 1997"));
		case V_P6:
		case V_P6 | V_P6mmx:
			return (gettext(
			    "See Appendix A.1 of the \"Intel Architecture "
			    "Software Developer's Manual,\" 243192, 1997"));
		default:
			break;
		}
	return (NULL);
}

/*
 * This is a functional interface to allow CPUs with fewer %pic registers
 * to share the same data structure as those with more %pic registers
 * within the same instruction set family.
 *
 * Returns the number of elements in the ce_pic[] array of cpc_event_t
 * for every cpu version known here, and 0 for any other version.
 */
uint_t
cpc_getnpic(int cpuver)
{
	if (cpuver == CPC_PENTIUM || cpuver == CPC_PENTIUM_MMX ||
	    cpuver == CPC_PENTIUM_PRO || cpuver == CPC_PENTIUM_PRO_MMX) {
		/* sizeof only inspects the type; the null is never read */
		return (sizeof (((cpc_event_t *)0)->ce_pic) /
		    sizeof (((cpc_event_t *)0)->ce_pic[0]));
	}

	return (0);
}

/*
 * Extract the bit field v[u:l] (both bounds inclusive, bit 0 being the
 * least significant) from a 32-bit value, right-justified.
 *
 * The mask is built by shifting an all-ones unsigned constant right,
 * so the expression is well defined for every field width from 1 to
 * 32 bits; building it as (1 << width) - 1 would overflow a signed int
 * for 31- and 32-bit fields.  Requires 0 <= l <= u <= 31.
 */
#define	BITS(v, u, l)	\
	(((v) >> (l)) & (0xffffffffU >> (31 - ((u) - (l)))))

#include "getcpuid.h"

/*
 * Return the version of the current processor.
 *
 * Version -1 is defined as 'not performance counter capable'
 *
 * Only processors whose cpuid vendor string is "GenuineIntel" are
 * recognized.  Once a version other than -1 has been determined it is
 * remembered in a static variable and returned directly on later calls.
 */
int
cpc_getcpuver(void)
{
	static int ver = -1;
	uint32_t maxeax;
	uint32_t vbuf[4];

	if (ver != -1)
		return (ver);

	/*
	 * cpuid function 0: the 12-byte vendor string is laid out in
	 * %ebx, %edx, %ecx order, hence the vbuf[0], vbuf[2], vbuf[1]
	 * destinations for the (ebx, ecx, edx) results.  The fourth
	 * word of vbuf only provides room for the terminating NUL.
	 */
	maxeax = cpc_getcpuid(0, &vbuf[0], &vbuf[2], &vbuf[1]);
	{
		char *vendor = (char *)vbuf;
		vendor[12] = '\0';

		if (strcmp(vendor, "GenuineIntel") != 0)
			return (ver);
	}

	if (maxeax >= 1) {
		int basefamily, family, model;
		uint32_t eax, ebx, ecx, edx;

		eax = cpc_getcpuid(1, &ebx, &ecx, &edx);

		/*
		 * Decode the family/model signature.  The extended
		 * fields do not replace the base fields: the extended
		 * family is added to a base family of 0xf, and the
		 * extended model supplies the upper four bits of the
		 * model when the base family is 6 or 0xf.
		 */
		basefamily = BITS(eax, 11, 8);
		family = basefamily;
		model = BITS(eax, 7, 4);
		if (basefamily == 0xf)
			family += BITS(eax, 27, 20);
		if (basefamily == 0x6 || basefamily == 0xf)
			model += BITS(eax, 19, 16) << 4;

		/*
		 * map family and model into the performance
		 * counter architectures we currently understand.
		 *
		 * See application note AP485 (from developer.intel.com)
		 * for further explanation.
		 */
		switch (family) {
		case 5:		/* Pentium and Pentium with MMX */
			ver = model < 4 ?
				CPC_PENTIUM : CPC_PENTIUM_MMX;
			break;
		case 6:		/* Pentium Pro and Pentium II and III */
			ver = BITS(edx, 23, 23) ?	   /* mmx check */
				CPC_PENTIUM_PRO_MMX : CPC_PENTIUM_PRO;
			break;
		case 0xf:	/* Pentium IV */
		default:
			/* not a counter architecture handled here */
			break;
		}
	}

	return (ver);
}