/*-
 * Copyright (C) 1996 Wolfgang Solfrank.
 * Copyright (C) 1996 TooLs GmbH.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by TooLs GmbH.
 * 4. The name of TooLs GmbH may not be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY TOOLS GMBH ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 *	$NetBSD: fpu.c,v 1.5 2001/07/22 11:29:46 wiz Exp $
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/proc.h>
#include <sys/systm.h>
#include <sys/limits.h>

#include <machine/altivec.h>
#include <machine/fpu.h>
#include <machine/ieeefp.h>
#include <machine/pcb.h>
#include <machine/psl.h>

#include <powerpc/fpu/fpu_arith.h>
#include <powerpc/fpu/fpu_emu.h>
#include <powerpc/fpu/fpu_extern.h>

/*
 * Handlers for the SPE floating-point data and round exceptions; both are
 * defined later in this file.
 */
void spe_handle_fpdata(struct trapframe *);
void spe_handle_fpround(struct trapframe *);
/* Forward declaration of the per-operation emulation helper. */
static int spe_emu_instr(uint32_t, struct fpemu *, struct fpn **, uint32_t *);

/*
 * Save the SPE state into td's PCB without touching any ownership
 * bookkeeping: one evstdw per register n into pcb_vec.vr[n], one 64-bit
 * value staged through %r0 into pcb_vec.spare[0], and SPEFSCR into
 * pcb_vec.vscr.  PSL_VEC is set in the MSR for the duration of the save and
 * the caller's MSR value is written back before returning.
 */
static void
save_vec_int(struct thread *td)
{
	int	msr;
	struct	pcb *pcb;

	pcb = td->td_pcb;

	/*
	 * Temporarily re-enable the vector unit during the save
	 */
	msr = mfmsr();
	mtmsr(msr | PSL_VEC);

	/*
	 * Save the vector registers and SPEFSCR to the PCB
	 *
	 * NOTE(review): these asm statements write memory but declare no
	 * "memory" clobber -- confirm the compiler cannot reorder PCB accesses
	 * around them.
	 */
#define EVSTDW(n)   __asm ("evstdw %1,0(%0)" \
		:: "b"(pcb->pcb_vec.vr[n]), "n"(n));
	EVSTDW(0);	EVSTDW(1);	EVSTDW(2);	EVSTDW(3);
	EVSTDW(4);	EVSTDW(5);	EVSTDW(6);	EVSTDW(7);
	EVSTDW(8);	EVSTDW(9);	EVSTDW(10);	EVSTDW(11);
	EVSTDW(12);	EVSTDW(13);	EVSTDW(14);	EVSTDW(15);
	EVSTDW(16);	EVSTDW(17);	EVSTDW(18);	EVSTDW(19);
	EVSTDW(20);	EVSTDW(21);	EVSTDW(22);	EVSTDW(23);
	EVSTDW(24);	EVSTDW(25);	EVSTDW(26);	EVSTDW(27);
	EVSTDW(28);	EVSTDW(29);	EVSTDW(30);	EVSTDW(31);
#undef EVSTDW

	/*
	 * NOTE(review): this looks like a read of the accumulator: %r0 is
	 * zeroed, evmwumiaa presumably adds 0*0 to ACC and leaves the sum in
	 * %r0, and that value is stored to spare[0] (enable_vec() reloads ACC
	 * from the same slot).  %r0 is modified without being listed as a
	 * clobber -- confirm that is safe here.
	 */
	__asm ( "evxor 0,0,0\n"
		"evmwumiaa 0,0,0\n"
		"evstdd 0,0(%0)" :: "b"(&pcb->pcb_vec.spare[0]));
	pcb->pcb_vec.vscr = mfspr(SPR_SPEFSCR);

	/*
	 * Disable vector unit again
	 */
	isync();
	mtmsr(msr);

}

/*
 * Make `td' the owner of this CPU's SPE unit and reload its saved state.
 *
 * Records the current CPU in the thread's PCB, marks the thread as the
 * per-CPU vector thread, sets PSL_VEC in the saved SRR1 so the unit is
 * enabled when the thread resumes, and restores SPEFSCR, the accumulator
 * and the register high words from pcb_vec.
 */
void
enable_vec(struct thread *td)
{
	int	msr;
	struct	pcb *pcb;
	struct	trapframe *tf;

	pcb = td->td_pcb;
	tf = trapframe(td);

	/*
	 * Save the thread's SPE CPU number, and set the CPU's current
	 * vector thread
	 */
	td->td_pcb->pcb_veccpu = PCPU_GET(cpuid);
	PCPU_SET(vecthread, td);

	/*
	 * Enable the vector unit for when the thread returns from the
	 * exception. If this is the first time the unit has been used by
	 * the thread (PCB_VEC not yet set), zero the saved vector state,
	 * set the flag to indicate that the vector unit is in use, and seed
	 * the saved status word from the current SPEFSCR value.
	 */
	tf->srr1 |= PSL_VEC;
	if (!(pcb->pcb_flags & PCB_VEC)) {
		memset(&pcb->pcb_vec, 0, sizeof pcb->pcb_vec);
		pcb->pcb_flags |= PCB_VEC;
		pcb->pcb_vec.vscr = mfspr(SPR_SPEFSCR);
	}

	/*
	 * Temporarily enable the vector unit so the registers
	 * can be restored.
	 */
	msr = mfmsr();
	mtmsr(msr | PSL_VEC);

	/* Restore SPEFSCR and ACC.  Use %r0 as the scratch for ACC. */
	mtspr(SPR_SPEFSCR, pcb->pcb_vec.vscr);
	__asm __volatile("isync;evldd 0, 0(%0); evmra 0,0\n"
	    :: "b"(&pcb->pcb_vec.spare[0]));

	/*
	 * The lower half of each register will be restored on trap return.  Use
	 * %r0 as a scratch register, and restore it last.
	 *
	 * NOTE(review): %r0 and the target registers are modified without
	 * being declared as clobbers, and the asm reads the PCB without a
	 * "memory" clobber -- confirm the compiler cannot be misled by this.
	 */
#define	EVLDW(n)   __asm __volatile("evldw 0, 0(%0); evmergehilo "#n",0,"#n \
	    :: "b"(&pcb->pcb_vec.vr[n]));
	EVLDW(1);	EVLDW(2);	EVLDW(3);	EVLDW(4);
	EVLDW(5);	EVLDW(6);	EVLDW(7);	EVLDW(8);
	EVLDW(9);	EVLDW(10);	EVLDW(11);	EVLDW(12);
	EVLDW(13);	EVLDW(14);	EVLDW(15);	EVLDW(16);
	EVLDW(17);	EVLDW(18);	EVLDW(19);	EVLDW(20);
	EVLDW(21);	EVLDW(22);	EVLDW(23);	EVLDW(24);
	EVLDW(25);	EVLDW(26);	EVLDW(27);	EVLDW(28);
	EVLDW(29);	EVLDW(30);	EVLDW(31);	EVLDW(0);
#undef EVLDW

	/* Put the MSR back the way it was on entry. */
	isync();
	mtmsr(msr);
}

/*
 * Save td's SPE state to its PCB and give up ownership of the unit: the
 * PCB's CPU id is reset to INT_MAX and this CPU's vector thread is cleared.
 */
void
save_vec(struct thread *td)
{

	/* Flush the live register state into the PCB first. */
	save_vec_int(td);

	/*
	 * Drop ownership on both sides.
	 * XXX should this be left clear to allow lazy save/restore ?
	 */
	td->td_pcb->pcb_veccpu = INT_MAX;
	PCPU_SET(vecthread, NULL);
}

/*
 * Save SPE state without dropping ownership.  This will only save state if
 * the current vector-thread is `td'.  This is used for taking core dumps, so
 * don't leak kernel information; overwrite the low words of each vector with
 * their real value, taken from the thread's trap frame, unconditionally.
 */
void
save_vec_nodrop(struct thread *td)
{
	struct pcb *pcb;
	int i;

	if (td == PCPU_GET(vecthread))
		save_vec_int(td);

	pcb = td->td_pcb;

	for (i = 0; i < 32; i++) {
		pcb->pcb_vec.vr[i][1] =
		    td->td_frame ? td->td_frame->fixreg[i] : 0;
	}
}


/*
 * Generic operation codes.  Applying SPE_INST_MASK to the low 11 bits of an
 * instruction removes the bits that distinguish the vector (EVFS*), scalar
 * single (EFS*) and scalar double (EFD*) encodings below, so that e.g.
 * EVFSADD, EFSADD and EFDADD all reduce to EADD.
 */
#define	SPE_INST_MASK	0x31f
#define	EADD	0x200
#define	ESUB	0x201
#define	EABS	0x204
#define	ENABS	0x205
#define	ENEG	0x206
#define	EMUL	0x208
#define	EDIV	0x209
#define	ECMPGT	0x20c
#define	ECMPLT	0x20d
#define	ECMPEQ	0x20e
#define	ECFUI	0x210
#define	ECFSI	0x211
#define	ECTUI	0x214
#define	ECTSI	0x215
#define	ECTUF	0x216
#define	ECTSF	0x217
#define	ECTUIZ	0x218
#define	ECTSIZ	0x21a

/*
 * Values of (instr >> 5) & 0x7, which the decoder uses to pick the operand
 * format: vector single (SPE), scalar single (SPFP), scalar double (DPFP).
 */
#define	SPE		0x4
#define	SPFP		0x6
#define	DPFP		0x7

/* Primary opcode: instr >> OPC_SHIFT must equal SPE_OPC to be emulated. */
#define	SPE_OPC		4
#define	OPC_SHIFT	26

/* Vector single-precision encodings (instr & 0x7ff). */
#define	EVFSADD		0x280
#define	EVFSSUB		0x281
#define	EVFSABS		0x284
#define	EVFSNABS	0x285
#define	EVFSNEG		0x286
#define	EVFSMUL		0x288
#define	EVFSDIV		0x289
#define	EVFSCMPGT	0x28c
#define	EVFSCMPLT	0x28d
#define	EVFSCMPEQ	0x28e
#define	EVFSCFUI	0x290
#define	EVFSCFSI	0x291
#define	EVFSCTUI	0x294
#define	EVFSCTSI	0x295
#define	EVFSCTUF	0x296
#define	EVFSCTSF	0x297
#define	EVFSCTUIZ	0x298
#define	EVFSCTSIZ	0x29a

/* Scalar single-precision encodings (instr & 0x7ff). */
#define	EFSADD		0x2c0
#define	EFSSUB		0x2c1
#define	EFSABS		0x2c4
#define	EFSNABS		0x2c5
#define	EFSNEG		0x2c6
#define	EFSMUL		0x2c8
#define	EFSDIV		0x2c9
#define	EFSCMPGT	0x2cc
#define	EFSCMPLT	0x2cd
#define	EFSCMPEQ	0x2ce
#define	EFSCFD		0x2cf
#define	EFSCFUI		0x2d0
#define	EFSCFSI		0x2d1
#define	EFSCTUI		0x2d4
#define	EFSCTSI		0x2d5
#define	EFSCTUF		0x2d6
#define	EFSCTSF		0x2d7
#define	EFSCTUIZ	0x2d8
#define	EFSCTSIZ	0x2da

/* Scalar double-precision encodings (instr & 0x7ff). */
#define	EFDADD		0x2e0
#define	EFDSUB		0x2e1
#define	EFDABS		0x2e4
#define	EFDNABS		0x2e5
#define	EFDNEG		0x2e6
#define	EFDMUL		0x2e8
#define	EFDDIV		0x2e9
#define	EFDCMPGT	0x2ec
#define	EFDCMPLT	0x2ed
#define	EFDCMPEQ	0x2ee
#define	EFDCFS		0x2ef
#define	EFDCFUI		0x2f0
#define	EFDCFSI		0x2f1
#define	EFDCTUI		0x2f4
#define	EFDCTSI		0x2f5
#define	EFDCTUF		0x2f6
#define	EFDCTSF		0x2f7
#define	EFDCTUIZ	0x2f8
#define	EFDCTSIZ	0x2fa

/*
 * Result width tracked by spe_handle_fpdata(); SINGLE and DOUBLE are also
 * passed as the `type' argument of spe_explode().
 */
enum {
	NONE,		/* No floating-point result to write back. */
	SINGLE,		/* 32-bit operand / result. */
	DOUBLE,		/* 64-bit operand / result. */
	VECTOR,		/* Set for non-compare vector operations. */
};

/*
 * Translate the exception bits of an FPSCR-style word into the matching
 * SPEFSCR status bits.  Only VX, OX, UX, ZX and XX are translated; every
 * other input bit is ignored.
 */
static uint32_t fpscr_to_spefscr(uint32_t fpscr)
{

	return (((fpscr & FPSCR_VX) ? SPEFSCR_FINV : 0) |
	    ((fpscr & FPSCR_OX) ? SPEFSCR_FOVF : 0) |
	    ((fpscr & FPSCR_UX) ? SPEFSCR_FUNF : 0) |
	    ((fpscr & FPSCR_ZX) ? SPEFSCR_FDBZ : 0) |
	    ((fpscr & FPSCR_XX) ? SPEFSCR_FX : 0));
}

/*
 * Convert `fpn' to a 32-bit integer through fpu_ftox().
 *
 * `sign' is 0 for an unsigned conversion and 1 for a signed one.  The value
 * returned by fpu_ftox() is treated as the upper word of the result:
 *   - upper word 0, or all-ones on a signed conversion: the lower word is
 *     stored through `val';
 *   - upper word all-ones on an unsigned conversion: FPSCR_UX is raised in
 *     fe_cx and `val' is left untouched;
 *   - anything else: FPSCR_OX is raised in fe_cx and `val' is left untouched.
 * Always returns 0.
 */
static int
spe_to_int(struct fpemu *fpemu, struct fpn *fpn, uint32_t *val, int sign)
{
	uint32_t words[2];
	uint32_t upper;

	upper = fpu_ftox(fpemu, fpn, words);

	if (upper == 0 || (upper == UINT_MAX && sign != 0))
		*val = words[1];
	else if (upper == UINT_MAX)
		fpemu->fe_cx |= FPSCR_UX;
	else
		fpemu->fe_cx |= FPSCR_OX;

	return (0);
}

/*
 * Perform one emulated operation, selected by `instr' after masking with
 * SPE_INST_MASK.
 *
 * Arithmetic operations store the resulting fpn pointer through `result'.
 * Float-to-integer conversions convert fe_f2 and hand `iresult' to
 * spe_to_int().  Abs/nabs/neg are handled by the caller and are a no-op
 * here.  An unrecognised operation is logged and otherwise ignored.
 *
 * Returns 1 or 0 for compare operations (condition true / false) and -1
 * for everything else.
 */
static int
spe_emu_instr(uint32_t instr, struct fpemu *fpemu,
    struct fpn **result, uint32_t *iresult)
{
	const uint32_t op = instr & SPE_INST_MASK;

	switch (op) {
	case EABS:
	case ENABS:
	case ENEG:
		/* Taken care of elsewhere. */
		break;
	case ECTUI:
	case ECTUIZ:
	case ECTSI:
	case ECTSIZ:
		/* The 'Z' variants first select round-toward-zero. */
		if (op == ECTUIZ || op == ECTSIZ) {
			fpemu->fe_cx &= ~FPSCR_RN;
			fpemu->fe_cx |= FP_RZ;
		}
		if (op == ECTSI || op == ECTSIZ)
			spe_to_int(fpemu, &fpemu->fe_f2, iresult, 1);
		else
			spe_to_int(fpemu, &fpemu->fe_f2, iresult, 0);
		break;
	case EADD:
		*result = fpu_add(fpemu);
		break;
	case ESUB:
		*result = fpu_sub(fpemu);
		break;
	case EMUL:
		*result = fpu_mul(fpemu);
		break;
	case EDIV:
		*result = fpu_div(fpemu);
		break;
	case ECMPGT:
		fpu_compare(fpemu, 0);
		return ((fpemu->fe_cx & FPSCR_FG) ? 1 : 0);
	case ECMPLT:
		fpu_compare(fpemu, 0);
		return ((fpemu->fe_cx & FPSCR_FL) ? 1 : 0);
	case ECMPEQ:
		fpu_compare(fpemu, 0);
		return ((fpemu->fe_cx & FPSCR_FE) ? 1 : 0);
	default:
		printf("Unknown instruction %x\n", instr);
		break;
	}

	return (-1);
}

/*
 * Unpack a raw floating-point value into the emulator's internal `fpn' form.
 *
 * `type' must be SINGLE (value in `hi', `lo' ignored) or DOUBLE (upper word
 * in `hi', lower word in `lo'); any other value is a programming error and
 * panics, since there would be no class to assign.  The sign is taken from
 * bit 31 of `hi'.  A signalling NaN is quieted, classified as FPC_SNAN, and
 * fe_cx is set to FPSCR_VXSNAN.  Always returns 0.
 */
static int
spe_explode(struct fpemu *fe, struct fpn *fp, uint32_t type,
    uint32_t hi, uint32_t lo)
{
	uint32_t s;

	fp->fp_sign = hi >> 31;
	fp->fp_sticky = 0;
	switch (type) {
	case SINGLE:
		s = fpu_stof(fp, hi);
		break;

	case DOUBLE:
		s = fpu_dtof(fp, hi, lo);
		break;

	default:
		/* Never fall through with `s' uninitialised. */
		panic("spe_explode: unknown operand type %u", type);
		/* NOTREACHED */
	}

	if (s == FPC_QNAN && (fp->fp_mant[0] & FP_QUIETBIT) == 0) {
		/*
		 * Input is a signalling NaN.  All operations that return
		 * an input NaN operand put it through a ``NaN conversion'',
		 * which basically just means ``turn on the quiet bit''.
		 * We do this here so that all NaNs internally look quiet
		 * (we can tell signalling ones by their class).
		 */
		fp->fp_mant[0] |= FP_QUIETBIT;
		fe->fe_cx = FPSCR_VXSNAN;	/* assert invalid operand */
		s = FPC_SNAN;
	}
	fp->fp_class = s;

	return (0);
}

/*
 * Save the high word of a 64-bit GPR for manipulation in the exception handler.
 *
 * Register `reg' is stored to a scratch buffer with evstdw and the first
 * word of that buffer is returned.  The buffer is zero-initialised so that a
 * `reg' outside 0..31, which matches no case, yields 0 instead of
 * uninitialised stack contents.
 */
static uint32_t
spe_save_reg_high(int reg)
{
	uint32_t vec[2] = { 0, 0 };
#define EVSTDW(n)   case n: __asm __volatile ("evstdw %1,0(%0)" \
		:: "b"(vec), "n"(n) : "memory"); break;
	switch (reg) {
	EVSTDW(0);	EVSTDW(1);	EVSTDW(2);	EVSTDW(3);
	EVSTDW(4);	EVSTDW(5);	EVSTDW(6);	EVSTDW(7);
	EVSTDW(8);	EVSTDW(9);	EVSTDW(10);	EVSTDW(11);
	EVSTDW(12);	EVSTDW(13);	EVSTDW(14);	EVSTDW(15);
	EVSTDW(16);	EVSTDW(17);	EVSTDW(18);	EVSTDW(19);
	EVSTDW(20);	EVSTDW(21);	EVSTDW(22);	EVSTDW(23);
	EVSTDW(24);	EVSTDW(25);	EVSTDW(26);	EVSTDW(27);
	EVSTDW(28);	EVSTDW(29);	EVSTDW(30);	EVSTDW(31);
	}
#undef EVSTDW

	return (vec[0]);
}

/*
 * Load the given value into the high word of the requested register.
 *
 * One evmergelo is emitted per possible register number, with `val' as the
 * first source and the register itself as the second, so the register's own
 * low word is presumably left as it was (NOTE(review): confirm against the
 * SPE instruction reference).  A `reg' outside 0..31 matches no case and is
 * silently ignored.  The target register is changed without being declared
 * to the compiler as an output or clobber.
 */
static void
spe_load_reg_high(int reg, uint32_t val)
{
#define	EVLDW(n)   case n: __asm __volatile("evmergelo "#n",%0,"#n \
	    :: "r"(val)); break;
	switch (reg) {
	EVLDW(1);	EVLDW(2);	EVLDW(3);	EVLDW(4);
	EVLDW(5);	EVLDW(6);	EVLDW(7);	EVLDW(8);
	EVLDW(9);	EVLDW(10);	EVLDW(11);	EVLDW(12);
	EVLDW(13);	EVLDW(14);	EVLDW(15);	EVLDW(16);
	EVLDW(17);	EVLDW(18);	EVLDW(19);	EVLDW(20);
	EVLDW(21);	EVLDW(22);	EVLDW(23);	EVLDW(24);
	EVLDW(25);	EVLDW(26);	EVLDW(27);	EVLDW(28);
	EVLDW(29);	EVLDW(30);	EVLDW(31);	EVLDW(0);
	}
#undef EVLDW

}

/*
 * Emulate the SPE floating-point instruction at frame->srr0.
 *
 * The instruction word is fetched with fueword32(); if the fetch faults or
 * the primary opcode is not SPE_OPC the frame is left untouched.  Otherwise
 * the operands are unpacked with spe_explode(), the operation is carried out
 * by spe_emu_instr(), and the result is written back: low words through
 * frame->fixreg[], high words through spe_load_reg_high(), compare results
 * into frame->cr.  Finally the collected status is merged into SPEFSCR,
 * srr0 is advanced past the instruction and the entry MSR is restored.
 *
 * Operations that spe_emu_instr() does not recognise produce no result; the
 * destination register is then left unchanged rather than being written from
 * an unset result pointer.
 */
void
spe_handle_fpdata(struct trapframe *frame)
{
	struct fpemu fpemu;
	struct fpn *result;
	uint32_t instr, instr_sec_op;
	uint32_t cr_shift, ra, rb, rd, src;
	uint32_t high, res, tmp; /* For vector operations. */
	uint32_t spefscr = 0;
	uint32_t ftod_res[2];
	int hi_rc, lo_rc; /* spe_emu_instr() status for each vector lane. */
	int width; /* Single, Double, Vector, Integer */
	int err;
	uint32_t msr;

	err = fueword32((void *)frame->srr0, &instr);
	if (err != 0) {
		/* Fault while fetching the instruction; nothing to emulate. */
		return;
	}

	if ((instr >> OPC_SHIFT) != SPE_OPC)
		return;

	msr = mfmsr();
	/*
	 * 'cr' field is the upper 3 bits of rd.  Since a) rd is 5 bits and
	 * b) each 'cr' field is 4 bits, rd with its low two bits cleared is
	 * exactly 4 * crD and can be used to derive the shift.  The low two
	 * bits are masked off so that a malformed encoding can neither select
	 * a misaligned field nor produce an out-of-range shift count.
	 */
	rd = (instr >> 21) & 0x1f;
	ra = (instr >> 16) & 0x1f;
	rb = (instr >> 11) & 0x1f;
	src = (instr >> 5) & 0x7;
	cr_shift = 28 - (rd & 0x1c);

	instr_sec_op = (instr & 0x7ff);

	memset(&fpemu, 0, sizeof(fpemu));

	/* Stays NULL unless an operation actually produces an fpn result. */
	result = NULL;
	res = 0;
	width = NONE;
	switch (src) {
	case SPE:
		mtmsr(msr | PSL_VEC);
		switch (instr_sec_op) {
		case EVFSABS:
			high = spe_save_reg_high(ra) & ~(1U << 31);
			frame->fixreg[rd] = frame->fixreg[ra] & ~(1U << 31);
			spe_load_reg_high(rd, high);
			break;
		case EVFSNABS:
			high = spe_save_reg_high(ra) | (1U << 31);
			frame->fixreg[rd] = frame->fixreg[ra] | (1U << 31);
			spe_load_reg_high(rd, high);
			break;
		case EVFSNEG:
			high = spe_save_reg_high(ra) ^ (1U << 31);
			frame->fixreg[rd] = frame->fixreg[ra] ^ (1U << 31);
			spe_load_reg_high(rd, high);
			break;
		default:
			/* High word */
			spe_explode(&fpemu, &fpemu.fe_f1, SINGLE,
			    spe_save_reg_high(ra), 0);
			spe_explode(&fpemu, &fpemu.fe_f2, SINGLE,
			    spe_save_reg_high(rb), 0);
			/*
			 * Seed the integer result with the current high word
			 * of rd so that a conversion which stores nothing
			 * leaves the register as it was, mirroring the low
			 * word where frame->fixreg[rd] is passed directly.
			 */
			tmp = spe_save_reg_high(rd);
			hi_rc = spe_emu_instr(instr_sec_op, &fpemu, &result,
			    &tmp);

			/* Negative status: not a compare, so write a value. */
			if (hi_rc < 0) {
				if (result != NULL)
					tmp = fpu_ftos(&fpemu, result);
				spe_load_reg_high(rd, tmp);
			}

			spefscr = fpscr_to_spefscr(fpemu.fe_cx) << 16;
			/* Clear the fpemu to start over on the lower bits. */
			memset(&fpemu, 0, sizeof(fpemu));
			result = NULL;

			/* Now low word */
			spe_explode(&fpemu, &fpemu.fe_f1, SINGLE,
			    frame->fixreg[ra], 0);
			spe_explode(&fpemu, &fpemu.fe_f2, SINGLE,
			    frame->fixreg[rb], 0);
			lo_rc = spe_emu_instr(instr_sec_op, &fpemu, &result,
			    &frame->fixreg[rd]);
			if (lo_rc < 0 && result != NULL)
				frame->fixreg[rd] = fpu_ftos(&fpemu, result);
			/* Collect status only after the operation has run. */
			spefscr |= fpscr_to_spefscr(fpemu.fe_cx);

			if (instr_sec_op == EVFSCMPEQ ||
			    instr_sec_op == EVFSCMPGT ||
			    instr_sec_op == EVFSCMPLT) {
				/* high, low, high OR low, high AND low */
				res = (hi_rc << 3) | (lo_rc << 2) |
				    ((hi_rc | lo_rc) << 1) | (hi_rc & lo_rc);
				frame->cr &= ~(0xfU << cr_shift);
				frame->cr |= (res << cr_shift);
			}
			break;
		}
		goto end;

	case SPFP:
		switch (instr_sec_op) {
		case EFSABS:
			frame->fixreg[rd] = frame->fixreg[ra] & ~(1U << 31);
			break;
		case EFSNABS:
			frame->fixreg[rd] = frame->fixreg[ra] | (1U << 31);
			break;
		case EFSNEG:
			frame->fixreg[rd] = frame->fixreg[ra] ^ (1U << 31);
			break;
		case EFSCFD:
			mtmsr(msr | PSL_VEC);
			spe_explode(&fpemu, &fpemu.fe_f3, DOUBLE,
			    spe_save_reg_high(rb), frame->fixreg[rb]);
			result = &fpemu.fe_f3;
			width = SINGLE;
			break;
		default:
			spe_explode(&fpemu, &fpemu.fe_f1, SINGLE,
			    frame->fixreg[ra], 0);
			spe_explode(&fpemu, &fpemu.fe_f2, SINGLE,
			    frame->fixreg[rb], 0);
			width = SINGLE;
		}
		break;
	case DPFP:
		mtmsr(msr | PSL_VEC);
		switch (instr_sec_op) {
		case EFDABS:
			high = spe_save_reg_high(ra) & ~(1U << 31);
			frame->fixreg[rd] = frame->fixreg[ra];
			spe_load_reg_high(rd, high);
			break;
		case EFDNABS:
			high = spe_save_reg_high(ra) | (1U << 31);
			frame->fixreg[rd] = frame->fixreg[ra];
			spe_load_reg_high(rd, high);
			break;
		case EFDNEG:
			high = spe_save_reg_high(ra) ^ (1U << 31);
			frame->fixreg[rd] = frame->fixreg[ra];
			spe_load_reg_high(rd, high);
			break;
		case EFDCFS:
			spe_explode(&fpemu, &fpemu.fe_f3, SINGLE,
			    frame->fixreg[rb], 0);
			result = &fpemu.fe_f3;
			width = DOUBLE;
			break;
		default:
			spe_explode(&fpemu, &fpemu.fe_f1, DOUBLE,
			    spe_save_reg_high(ra), frame->fixreg[ra]);
			spe_explode(&fpemu, &fpemu.fe_f2, DOUBLE,
			    spe_save_reg_high(rb), frame->fixreg[rb]);
			width = DOUBLE;
		}
		break;
	}
	switch (instr_sec_op) {
	case EFDCFS:
	case EFSCFD:
		/* Already handled. */
		break;
	default:
		res = spe_emu_instr(instr_sec_op, &fpemu, &result,
		    &frame->fixreg[rd]);
		/* Compare results (0 or 1) go into the GT position. */
		if (res != (uint32_t)-1)
			res <<= 2;
		break;
	}

	switch (instr_sec_op & SPE_INST_MASK) {
	case ECMPEQ:
	case ECMPGT:
	case ECMPLT:
		frame->cr &= ~(0xfU << cr_shift);
		frame->cr |= (res << cr_shift);
		break;
	case ECTUI:
	case ECTUIZ:
	case ECTSI:
	case ECTSIZ:
		/* spe_to_int() already stored the integer result. */
		break;
	default:
		switch (width) {
		case NONE:
		case VECTOR:
			break;
		case SINGLE:
			/* result is NULL for unrecognised operations. */
			if (result != NULL)
				frame->fixreg[rd] = fpu_ftos(&fpemu, result);
			break;
		case DOUBLE:
			if (result != NULL) {
				spe_load_reg_high(rd,
				    fpu_ftod(&fpemu, result, ftod_res));
				frame->fixreg[rd] = ftod_res[1];
			}
			break;
		default:
			panic("Unknown storage width %d", width);
			break;
		}
	}

end:
	/*
	 * NOTE(review): only the vector path folds fpemu.fe_cx into spefscr;
	 * the scalar paths reach here with spefscr still 0 -- confirm whether
	 * that is intended.
	 */
	spefscr |= (mfspr(SPR_SPEFSCR) & ~SPEFSCR_FINVS);
	mtspr(SPR_SPEFSCR, spefscr);
	frame->srr0 += 4;
	mtmsr(msr);
}

void
spe_handle_fpround(struct trapframe *frame)
{
	/*
	 * Round exceptions are intentionally not handled yet: the function
	 * does nothing, so the truncated result stays in the register.
	 * Overflow/underflow handling is still to be done.
	 */
}