/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/* common code with bug fixes from original version in trap.c */

#include <sys/param.h>
#include <sys/types.h>
#include <sys/systm.h>
#include <sys/archsystm.h>
#include <sys/vmsystm.h>
#include <sys/fpu/fpusystm.h>
#include <sys/fpu/fpu_simulator.h>
#include <sys/inline.h>
#include <sys/debug.h>
#include <sys/privregs.h>
#include <sys/machpcb.h>
#include <sys/simulate.h>
#include <sys/proc.h>
#include <sys/cmn_err.h>
#include <sys/stack.h>
#include <sys/watchpoint.h>
#include <sys/trap.h>
#include <sys/machtrap.h>
#include <sys/mman.h>
#include <sys/asi.h>
#include <sys/copyops.h>
#include <vm/as.h>
#include <vm/page.h>
#include <sys/model.h>
#include <vm/seg_vn.h>
#include <sys/byteorder.h>

/*
 * Instruction-decoding helper macros.  Every argument and every expansion
 * is fully parenthesized so that compound expressions may be passed safely.
 */

/* Non-zero when bit 13 (mask 0x2000, the 'i' bit) is set in x. */
#define	IS_IBIT_SET(x)	((x) & 0x2000)

/* True when op is 2 and op3 is 0x36. */
#define	IS_VIS1(op, op3)	((op) == 2 && (op3) == 0x36)

/*
 * True for an lddfa/stdfa (op 3) whose asi lies above ASI_SNFL, the
 * highest asi handled directly by the simulation code in this file.
 */
#define	IS_PARTIAL_OR_SHORT_FLOAT_LD_ST(op, op3, asi)		\
		((op) == 3 && ((op3) == IOP_V8_LDDFA ||		\
		(op3) == IOP_V8_STDFA) && (asi) > ASI_SNFL)

/* When non-zero, do_unaligned() prints a trace of what it decodes. */
static int aligndebug = 0;

/*
 * For the sake of those who must be compatible with unaligned
 * architectures, users can link their programs to use a
 * corrective trap handler that will fix unaligned references;
 * a special trap #6 (T_FIX_ALIGN) enables this 'feature'.
 *
 * do_unaligned() decodes the user load/store at rp->r_pc and emulates it
 * with copyin/copyout (or their little-endian variants), staging the value
 * in the local 'data' union.  Values narrower than 64 bits live in the
 * trailing bytes of data.l[0] (c[7], s[3], i[1]).
 *
 * Returns:
 *	SIMU_SUCCESS	the access was emulated
 *	SIMU_FAULT	a register or the target memory could not be accessed;
 *			*badaddr is set when the address is beyond the user
 *			limit, or by getreg()/putreg() when a register window
 *			access fails
 *	SIMU_ILLEGAL	bad ld/st fsr or quad register encoding, or a
 *			non-faulting floating-point store
 *	0		the instruction is not one this routine handles
 *			(not a load/store, ldstub/swap/cas, unknown size
 *			encoding, or an unsupported asi)
 */

int
do_unaligned(struct regs *rp, caddr_t *badaddr)
{
	uint_t	inst, op3, asi = 0;
	uint_t	rd, rs1, rs2;
	int	sz, nf = 0, ltlend = 0;
	int	floatflg;
	int	fsrflg;
	int	immflg;
	int	lddstdflg;
	caddr_t	addr;
	uint64_t val;
	union {
		uint64_t	l[2];
		uint32_t	i[4];
		uint16_t	s[8];
		uint8_t		c[16];
	} data;

	ASSERT(USERMODE(rp->r_tstate));
	inst = fetch_user_instr((caddr_t)rp->r_pc);

	op3 = (inst >> 19) & 0x3f;
	rd = (inst >> 25) & 0x1f;
	rs1 = (inst >> 14) & 0x1f;
	rs2 = inst & 0x1f;
	floatflg = (inst >> 24) & 1;
	immflg = (inst >> 13) & 1;
	lddstdflg = fsrflg = 0;

	/* if not load or store do nothing */
	if ((inst >> 30) != 3)
		return (0);

	/* if ldstub or swap, do nothing */
	if ((inst & 0xc1680000) == 0xc0680000)
		return (0);

	/* if cas/casx, do nothing */
	if ((inst & 0xc1e00000) == 0xc1e00000)
		return (0);

	if (floatflg) {
		switch ((inst >> 19) & 3) {	/* map size bits to a number */
		case 0: sz = 4;
			break;			/* ldf{a}/stf{a} */
		case 1: fsrflg = 1;
			if (rd == 0)
				sz = 4;		/* ldfsr/stfsr */
			else  if (rd == 1)
				sz = 8;		/* ldxfsr/stxfsr */
			else
				return (SIMU_ILLEGAL);
			break;
		case 2: sz = 16;
			break;		/* ldqf{a}/stqf{a} */
		case 3: sz = 8;
			break;		/* lddf{a}/stdf{a} */
		}
		/*
		 * Fix to access extra double register encoding plus
		 * compensate to access the correct fpu_dreg.
		 */
		if ((sz > 4) && (fsrflg == 0)) {
			if ((rd & 1) == 1)
				rd = (rd & 0x1e) | 0x20;
			rd = rd >> 1;
			if ((sz == 16) && ((rd & 0x1) != 0))
				return (SIMU_ILLEGAL);
		}
	} else {
		int sz_bits = (inst >> 19) & 0xf;
		switch (sz_bits) {		/* map size bits to a number */
		case 0:				/* lduw{a} */
		case 4:				/* stw{a} */
		case 8:				/* ldsw{a} */
		case 0xf:			/* swap */
			sz = 4; break;
		case 1:				/* ldub{a} */
		case 5:				/* stb{a} */
		case 9:				/* ldsb{a} */
		case 0xd:			/* ldstub */
			sz = 1; break;
		case 2:				/* lduh{a} */
		case 6:				/* sth{a} */
		case 0xa:			/* ldsh{a} */
			sz = 2; break;
		case 3:				/* ldd{a} */
		case 7:				/* std{a} */
			lddstdflg = 1;
			sz = 8; break;
		case 0xb:			/* ldx{a} */
		case 0xe:			/* stx{a} */
			sz = 8; break;
		default:
			/*
			 * Unknown size encoding: never go on with an
			 * uninitialized size, treat it as "not handled".
			 */
			return (0);
		}
	}


	/* only support primary and secondary asi's */
	if ((op3 >> 4) & 1) {
		if (immflg) {
			asi = (uint_t)(rp->r_tstate >> TSTATE_ASI_SHIFT) &
					TSTATE_ASI_MASK;
		} else {
			asi = (inst >> 5) & 0xff;
		}
		switch (asi) {
		case ASI_P:
		case ASI_S:
			break;
		case ASI_PNF:
		case ASI_SNF:
			nf = 1;
			break;
		case ASI_PL:
		case ASI_SL:
			ltlend = 1;
			break;
		case ASI_PNFL:
		case ASI_SNFL:
			ltlend = 1;
			nf = 1;
			break;
		default:
			return (0);
		}
		/*
		 * Non-faulting stores generate a data_access_exception trap,
		 * according to the Spitfire manual, which should be signaled
		 * as an illegal instruction trap, because it can't be fixed.
		 */
		if ((nf) && ((op3 == IOP_V8_STQFA) || (op3 == IOP_V8_STDFA)))
			return (SIMU_ILLEGAL);
	}

	if (aligndebug) {
		printf("unaligned access at %p, instruction: 0x%x\n",
		    (void *)rp->r_pc, inst);
		printf("type %s", (((inst >> 21) & 1) ? "st" : "ld"));
		if (((inst >> 21) & 1) == 0)
		    printf(" %s", (((inst >> 22) & 1) ? "signed" : "unsigned"));
		printf(" asi 0x%x size %d immflg %d\n", asi, sz, immflg);
		printf("rd = %d, op3 = 0x%x, rs1 = %d, rs2 = %d, imm13=0x%x\n",
			rd, op3, rs1, rs2, (inst & 0x1fff));
	}

	(void) flush_user_windows_to_stack(NULL);
	if (getreg(rp, rs1, &val, badaddr))
		return (SIMU_FAULT);
	addr = (caddr_t)val;		/* convert to 32/64 bit address */
	if (aligndebug)
		printf("addr 1 = %p\n", (void *)addr);

	/* check immediate bit and use immediate field or reg (rs2) */
	if (immflg) {
		int imm;
		/*
		 * Sign extend the 13-bit immediate.  The xor/subtract form
		 * avoids left-shifting a one into the sign bit of an int.
		 */
		imm = (int)((inst & 0x1fff) ^ 0x1000) - 0x1000;
		addr += imm;			/* compute address */
	} else {
		if (getreg(rp, rs2, &val, badaddr))
			return (SIMU_FAULT);
		addr += val;
	}

	/*
	 * If this is a 32-bit program, chop the address accordingly.
	 */
	if (curproc->p_model == DATAMODEL_ILP32)
		addr = (caddr_t)(caddr32_t)addr;

	if (aligndebug)
		printf("addr 2 = %p\n", (void *)addr);

	if (addr >= curproc->p_as->a_userlimit) {
		*badaddr = addr;
		goto badret;
	}

	/* a single bit differentiates ld and st */
	if ((inst >> 21) & 1) {			/* store */
		if (floatflg) {
			klwp_id_t lwp = ttolwp(curthread);
			kfpu_t *fp = lwptofpu(lwp);
			/* Ensure fp has been enabled */
			if (fpu_exists) {
				if (!(_fp_read_fprs() & FPRS_FEF))
					fp_enable();
			} else {
				if (!fp->fpu_en)
					fp_enable();
			}
			/* if fpu_exists read fpu reg */
			if (fpu_exists) {
				if (fsrflg) {
					_fp_read_pfsr(&data.l[0]);
				} else {
					if (sz == 4) {
						data.i[0] = 0;
						_fp_read_pfreg(
						    (unsigned *)&data.i[1], rd);
					}
					if (sz >= 8)
						_fp_read_pdreg(
							&data.l[0], rd);
					if (sz == 16)
						_fp_read_pdreg(
							&data.l[1], rd+1);
				}
			} else {
				if (fsrflg) {
					/* Clear reserved bits, set version=7 */
					fp->fpu_fsr &= ~0x30301000;
					fp->fpu_fsr |= 0xE0000;
					data.l[0] = fp->fpu_fsr;
				} else {
					if (sz == 4) {
						data.i[0] = 0;
						data.i[1] =
					    (unsigned)fp->fpu_fr.fpu_regs[rd];
					}
					if (sz >= 8)
						data.l[0] =
						    fp->fpu_fr.fpu_dregs[rd];
					if (sz == 16)
						data.l[1] =
						    fp->fpu_fr.fpu_dregs[rd+1];
				}
			}
		} else {
			if (lddstdflg) {		/* combine the data */
				if (getreg(rp, rd, &data.l[0], badaddr))
					return (SIMU_FAULT);
				if (getreg(rp, rd+1, &data.l[1], badaddr))
					return (SIMU_FAULT);
				if (ltlend) {
					/*
					 * For STD, each 32-bit word is byte-
					 * swapped individually.  For
					 * simplicity we don't want to do that
					 * below, so we swap the words now to
					 * get the desired result in the end.
					 */
					data.i[0] = data.i[3];
				} else {
					data.i[0] = data.i[1];
					data.i[1] = data.i[3];
				}
			} else {
				if (getreg(rp, rd, &data.l[0], badaddr))
					return (SIMU_FAULT);
			}
		}

		if (aligndebug) {
			if (sz == 16) {
				/* quad: dump all four 32-bit words */
				printf("data %x %x %x %x\n",
				    data.i[0], data.i[1], data.i[2], data.i[3]);
			} else {
				printf("data %x %x %x %x %x %x %x %x\n",
				    data.c[0], data.c[1], data.c[2], data.c[3],
				    data.c[4], data.c[5], data.c[6], data.c[7]);
			}
		}

		if (ltlend) {
			if (sz == 1) {
				if (xcopyout_little(&data.c[7], addr,
				    (size_t)sz) != 0)
					goto badret;
			} else if (sz == 2) {
				if (xcopyout_little(&data.s[3], addr,
				    (size_t)sz) != 0)
					goto badret;
			} else if (sz == 4) {
				if (xcopyout_little(&data.i[1], addr,
				    (size_t)sz) != 0)
					goto badret;
			} else {
				if (xcopyout_little(&data.l[0], addr,
				    (size_t)sz) != 0)
					goto badret;
			}
		} else {
			if (sz == 1) {
				if (copyout(&data.c[7], addr, (size_t)sz) == -1)
					goto badret;
			} else if (sz == 2) {
				if (copyout(&data.s[3], addr, (size_t)sz) == -1)
					goto badret;
			} else if (sz == 4) {
				if (copyout(&data.i[1], addr, (size_t)sz) == -1)
					goto badret;
			} else {
				if (copyout(&data.l[0], addr, (size_t)sz) == -1)
					goto badret;
			}
		}
	} else {				/* load */
		if (sz == 1) {
			if (ltlend) {
				if (xcopyin_little(addr, &data.c[7],
				    (size_t)sz) != 0) {
					if (nf)
						data.c[7] = 0;
					else
						goto badret;
				}
			} else {
				if (copyin(addr, &data.c[7],
				    (size_t)sz) == -1) {
					if (nf)
						data.c[7] = 0;
					else
						goto badret;
				}
			}
			/* if signed and the sign bit is set extend it */
			if (((inst >> 22) & 1) && ((data.c[7] >> 7) & 1)) {
				data.i[0] = (uint_t)-1;	/* extend sign bit */
				data.s[2] = (ushort_t)-1;
				data.c[6] = (uchar_t)-1;
			} else {
				data.i[0] = 0;	/* clear upper 32+24 bits */
				data.s[2] = 0;
				data.c[6] = 0;
			}
		} else if (sz == 2) {
			if (ltlend) {
				if (xcopyin_little(addr, &data.s[3],
				    (size_t)sz) != 0) {
					if (nf)
						data.s[3] = 0;
					else
						goto badret;
				}
			} else {
				if (copyin(addr, &data.s[3],
				    (size_t)sz) == -1) {
					if (nf)
						data.s[3] = 0;
					else
						goto badret;
				}
			}
			/* if signed and the sign bit is set extend it */
			if (((inst >> 22) & 1) && ((data.s[3] >> 15) & 1)) {
				data.i[0] = (uint_t)-1;	/* extend sign bit */
				data.s[2] = (ushort_t)-1;
			} else {
				data.i[0] = 0;	/* clear upper 32+16 bits */
				data.s[2] = 0;
			}
		} else if (sz == 4) {
			if (ltlend) {
				if (xcopyin_little(addr, &data.i[1],
				    (size_t)sz) != 0) {
					if (!nf)
						goto badret;
					data.i[1] = 0;
				}
			} else {
				if (copyin(addr, &data.i[1],
				    (size_t)sz) == -1) {
					if (!nf)
						goto badret;
					data.i[1] = 0;
				}
			}
			/* if signed and the sign bit is set extend it */
			if (((inst >> 22) & 1) && ((data.i[1] >> 31) & 1)) {
				data.i[0] = (uint_t)-1;	/* extend sign bit */
			} else {
				data.i[0] = 0;	/* clear upper 32 bits */
			}
		} else {
			/*
			 * 8 or 16 byte load.  When a non-faulting load
			 * fails the result is zero; clear both halves so
			 * that a quad load never hands uninitialized stack
			 * contents to the destination register.
			 */
			if (ltlend) {
				if (xcopyin_little(addr, &data.l[0],
				    (size_t)sz) != 0) {
					if (!nf)
						goto badret;
					data.l[0] = 0;
					data.l[1] = 0;
				}
			} else {
				if (copyin(addr, &data.l[0],
				    (size_t)sz) == -1) {
					if (!nf)
						goto badret;
					data.l[0] = 0;
					data.l[1] = 0;
				}
			}
		}

		if (aligndebug) {
			if (sz == 16) {
				/* quad: dump all four 32-bit words */
				printf("data %x %x %x %x\n",
				    data.i[0], data.i[1], data.i[2], data.i[3]);
			} else {
				printf("data %x %x %x %x %x %x %x %x\n",
				    data.c[0], data.c[1], data.c[2], data.c[3],
				    data.c[4], data.c[5], data.c[6], data.c[7]);
			}
		}

		if (floatflg) {		/* if fpu_exists write fpu reg */
			klwp_id_t lwp = ttolwp(curthread);
			kfpu_t *fp = lwptofpu(lwp);
			/* Ensure fp has been enabled */
			if (fpu_exists) {
				if (!(_fp_read_fprs() & FPRS_FEF))
					fp_enable();
			} else {
				if (!fp->fpu_en)
					fp_enable();
			}
			/* write the hardware fpu reg, or its saved copy */
			if (fpu_exists) {
				if (fsrflg) {
					_fp_write_pfsr(&data.l[0]);
				} else {
					if (sz == 4)
						_fp_write_pfreg(
						    (unsigned *)&data.i[1], rd);
					if (sz >= 8)
						_fp_write_pdreg(
							&data.l[0], rd);
					if (sz == 16)
						_fp_write_pdreg(
							&data.l[1], rd+1);
				}
			} else {
				if (fsrflg) {
					fp->fpu_fsr = data.l[0];
				} else {
					if (sz == 4)
						fp->fpu_fr.fpu_regs[rd] =
							(unsigned)data.i[1];
					if (sz >= 8)
						fp->fpu_fr.fpu_dregs[rd] =
							data.l[0];
					if (sz == 16)
						fp->fpu_fr.fpu_dregs[rd+1] =
							data.l[1];
				}
			}
		} else {
			if (lddstdflg) {		/* split the data */
				if (ltlend) {
					/*
					 * For LDD, each 32-bit word is byte-
					 * swapped individually.  We didn't
					 * do that above, but this will give
					 * us the desired result.
					 */
					data.i[3] = data.i[0];
				} else {
					data.i[3] = data.i[1];
					data.i[1] = data.i[0];
				}
				data.i[0] = 0;
				data.i[2] = 0;
				if (putreg(&data.l[0], rp, rd, badaddr) == -1)
					goto badret;
				if (putreg(&data.l[1], rp, rd+1, badaddr) == -1)
					goto badret;
			} else {
				if (putreg(&data.l[0], rp, rd, badaddr) == -1)
					goto badret;
			}
		}
	}
	return (SIMU_SUCCESS);
badret:
	return (SIMU_FAULT);
}


/*
 * Simulate ldd/std (and ldda/stda for the asi's listed below) for the
 * instruction at rp->r_pc, in either user or kernel mode.
 *
 * The even register (rd) maps to the word at the effective address and the
 * odd register (rd+1) to the word at the effective address + 4; only the
 * low 32 bits of each register take part in a store.
 *
 * Returns:
 *	SIMU_SUCCESS	the instruction was simulated
 *	SIMU_FAULT	a register or the memory operand could not be accessed
 *	SIMU_ILLEGAL	the asi is not one of those handled here
 *	SIMU_UNALIGN	the address is not 8-byte aligned and the process has
 *			not asked for alignment fixing; if it has, the result
 *			of do_unaligned() is returned instead
 */
int
simulate_lddstd(struct regs *rp, caddr_t *badaddr)
{
	uint_t	inst, op3, asi = 0;
	uint_t	rd, rs1, rs2;
	int	nf = 0, ltlend = 0, usermode;
	int	immflg;
	uint64_t reven;
	uint64_t rodd;
	caddr_t	addr;
	uint64_t val;
	uint64_t data;

	usermode = USERMODE(rp->r_tstate);

	if (usermode)
		inst = fetch_user_instr((caddr_t)rp->r_pc);
	else
		inst = *(uint_t *)rp->r_pc;

	op3 = (inst >> 19) & 0x3f;
	rd = (inst >> 25) & 0x1f;
	rs1 = (inst >> 14) & 0x1f;
	rs2 = inst & 0x1f;
	immflg = (inst >> 13) & 1;

	if (USERMODE(rp->r_tstate))
		(void) flush_user_windows_to_stack(NULL);
	else
		flush_windows();

	if ((op3 >> 4) & 1) {		/* is this LDDA/STDA? */
		if (immflg) {
			asi = (uint_t)(rp->r_tstate >> TSTATE_ASI_SHIFT) &
					TSTATE_ASI_MASK;
		} else {
			asi = (inst >> 5) & 0xff;
		}
		switch (asi) {
		case ASI_P:
		case ASI_S:
			break;
		case ASI_PNF:
		case ASI_SNF:
			nf = 1;
			break;
		case ASI_PL:
		case ASI_SL:
			ltlend = 1;
			break;
		case ASI_PNFL:
		case ASI_SNFL:
			ltlend = 1;
			nf = 1;
			break;
		case ASI_AIUP:
		case ASI_AIUS:
			/* "as if user" asi: access memory as user would */
			usermode = 1;
			break;
		case ASI_AIUPL:
		case ASI_AIUSL:
			usermode = 1;
			ltlend = 1;
			break;
		default:
			return (SIMU_ILLEGAL);
		}
	}

	if (getreg(rp, rs1, &val, badaddr))
		return (SIMU_FAULT);
	addr = (caddr_t)val;		/* convert to 32/64 bit address */

	/* check immediate bit and use immediate field or reg (rs2) */
	if (immflg) {
		int imm;
		/*
		 * Sign extend the 13-bit immediate.  The xor/subtract form
		 * avoids left-shifting a one into the sign bit of an int.
		 */
		imm = (int)((inst & 0x1fff) ^ 0x1000) - 0x1000;
		addr += imm;			/* compute address */
	} else {
		if (getreg(rp, rs2, &val, badaddr))
			return (SIMU_FAULT);
		addr += val;
	}

	/*
	 * T_UNIMP_LDD and T_UNIMP_STD are higher priority than
	 * T_ALIGNMENT.  So we have to make sure that the address is
	 * kosher before trying to use it, because the hardware hasn't
	 * checked it for us yet.
	 */
	if (((uintptr_t)addr & 0x7) != 0) {
		if (curproc->p_fixalignment)
			return (do_unaligned(rp, badaddr));
		else
			return (SIMU_UNALIGN);
	}

	/*
	 * If this is a 32-bit program, chop the address accordingly.
	 */
	if (curproc->p_model == DATAMODEL_ILP32 && usermode)
		addr = (caddr_t)(caddr32_t)addr;

	if ((inst >> 21) & 1) {			/* store */
		if (getreg(rp, rd, &reven, badaddr))
			return (SIMU_FAULT);
		if (getreg(rp, rd+1, &rodd, badaddr))
			return (SIMU_FAULT);
		if (ltlend) {
			reven = BSWAP_32(reven);
			rodd  = BSWAP_32(rodd);
		}
		/*
		 * Only the low word of each register is stored.  The odd
		 * register must be masked: globals and outs are 64 bits
		 * wide, and any upper bits would otherwise be or'ed into
		 * the even register's word.
		 */
		data = (reven << 32) | (rodd & 0xffffffffULL);
		if (usermode) {
			if (suword64_nowatch(addr, data) == -1)
				return (SIMU_FAULT);
		} else {
			*(uint64_t *)addr = data;
		}
	} else {				/* load */
		if (usermode) {
			if (fuword64_nowatch(addr, &data)) {
				/* a failed non-faulting load reads as zero */
				if (nf)
					data = 0;
				else
					return (SIMU_FAULT);
			}
		} else
			data = *(uint64_t *)addr;

		reven = (data >> 32);
		rodd  = (uint64_t)(uint32_t)data;
		if (ltlend) {
			reven = BSWAP_32(reven);
			rodd  = BSWAP_32(rodd);
		}

		if (putreg(&reven, rp, rd, badaddr) == -1)
			return (SIMU_FAULT);
		if (putreg(&rodd, rp, rd+1, badaddr) == -1)
			return (SIMU_FAULT);
	}
	return (SIMU_SUCCESS);
}


/*
 * Simulate popc: store in rd the number of one bits in the second
 * operand, which is either register rs2 or the sign-extended 13-bit
 * immediate (the count is taken over all 64 bits of the operand).
 *
 * Returns SIMU_ILLEGAL if the rs1 field is non-zero, SIMU_FAULT if a
 * register could not be read or written (getreg()/putreg() then set
 * *badaddr), and SIMU_SUCCESS otherwise.
 */
static int
simulate_popc(struct regs *rp, caddr_t *badaddr, uint_t inst)
{
	uint_t	rd, rs2, rs1;
	uint_t	immflg;
	uint64_t val, cnt;

	rd = (inst >> 25) & 0x1f;
	rs1 = (inst >> 14) & 0x1f;
	rs2 = inst & 0x1f;
	immflg = (inst >> 13) & 1;

	if (rs1 > 0)
		return (SIMU_ILLEGAL);

	(void) flush_user_windows_to_stack(NULL);

	/* check immediate bit and use immediate field or reg (rs2) */
	if (immflg) {
		/*
		 * Sign extend the 13-bit immediate to 64 bits.  The
		 * xor/subtract form avoids left-shifting a one into the
		 * sign bit of a signed type.
		 */
		val = (uint64_t)((int64_t)((inst & 0x1fff) ^ 0x1000) - 0x1000);
	} else {
		if (getreg(rp, rs2, &val, badaddr))
			return (SIMU_FAULT);
	}

	/* each pass clears the lowest set bit, so we loop once per one bit */
	for (cnt = 0; val != 0; val &= val - 1)
		cnt++;

	if (putreg(&cnt, rp, rd, badaddr) == -1)
		return (SIMU_FAULT);

	return (SIMU_SUCCESS);
}

/*
 * Simulate unimplemented instructions (popc, ldqf{a}, stqf{a}), hand
 * VIS and partial/short floating-point loads and stores to fp_precise(),
 * and clean "dirty" v8 instructions in 32-bit programs.
 *
 * Returns:
 *	SIMU_RETRY	the instruction should be re-executed (it was
 *			cleaned, passed to fp_precise(), or was seen for
 *			the first time at this address by this lwp)
 *	SIMU_ILLEGAL	the instruction cannot be fetched or fixed
 *	SIMU_FAULT	the text page could not be locked or mapped
 *	otherwise the result of simulate_popc() or do_unaligned()
 */
int
simulate_unimp(struct regs *rp, caddr_t *badaddr)
{
	uint_t	inst, optype, op3, asi;
	uint_t	rs1, rd;
	uint_t	ignor, i;
	machpcb_t *mpcb = lwptompcb(ttolwp(curthread));
	int	nomatch = 0;
	caddr_t	addr = (caddr_t)rp->r_pc;
	struct as *as;
	caddr_t	ka;
	pfn_t	pfnum;
	page_t *pp;
	proc_t *p = ttoproc(curthread);
	struct seg *mapseg;
	struct segvn_data *svd;

	ASSERT(USERMODE(rp->r_tstate));
	inst = fetch_user_instr(addr);
	if (inst == (uint_t)-1) {
		mpcb->mpcb_illexcaddr = addr;
		mpcb->mpcb_illexcinsn = (uint32_t)-1;
		return (SIMU_ILLEGAL);
	}

	/*
	 * When fixing dirty v8 instructions there's a race if two processors
	 * are executing the dirty executable at the same time.  If one
	 * cleans the instruction as the other is executing it the second
	 * processor will see a clean instruction when it comes through this
	 * code and will return SIMU_ILLEGAL.  To work around the race
	 * this code will keep track of the last illegal instruction seen
	 * by each lwp and will only take action if the illegal instruction
	 * is repeatable.
	 */
	if (addr != mpcb->mpcb_illexcaddr ||
	    inst != mpcb->mpcb_illexcinsn)
		nomatch = 1;
	mpcb->mpcb_illexcaddr = addr;
	mpcb->mpcb_illexcinsn = inst;

	/* instruction fields */
	i = (inst >> 13) & 0x1;
	rd = (inst >> 25) & 0x1f;
	optype = (inst >> 30) & 0x3;
	op3 = (inst >> 19) & 0x3f;
	ignor = (inst >> 5) & 0xff;
	if (IS_IBIT_SET(inst)) {
		asi = (uint32_t)((rp->r_tstate >> TSTATE_ASI_SHIFT) &
		    TSTATE_ASI_MASK);
	} else {
		asi = ignor;
	}

	if (IS_VIS1(optype, op3) ||
	    IS_PARTIAL_OR_SHORT_FLOAT_LD_ST(optype, op3, asi)) {
		klwp_t *lwp = ttolwp(curthread);
		kfpu_t *fp = lwptofpu(lwp);
		if (fpu_exists) {
			if (!(_fp_read_fprs() & FPRS_FEF))
				fp_enable();
			_fp_read_pfsr(&fp->fpu_fsr);
		} else {
			if (!fp->fpu_en)
				fp_enable();
		}
		fp_precise(rp);
		return (SIMU_RETRY);
	}

	if (optype == 2 && op3 == IOP_V8_POPC) {
		return (simulate_popc(rp, badaddr, inst));
	} else if (optype == 3 && op3 == IOP_V8_POPC) {
		return (SIMU_ILLEGAL);
	}

	if (optype == OP_V8_LDSTR) {
		if (op3 == IOP_V8_LDQF || op3 == IOP_V8_LDQFA ||
		    op3 == IOP_V8_STQF || op3 == IOP_V8_STQFA)
			return (do_unaligned(rp, badaddr));
	}

	/* first sighting of this instruction at this pc: just retry it */
	if (nomatch)
		return (SIMU_RETRY);

	/*
	 * The rest of the code handles v8 binaries with instructions
	 * that have dirty (non-zero) bits in reserved or 'ignored'
	 * fields; these will cause core dumps on v9 machines.
	 *
	 * We only clean dirty instructions in 32-bit programs (ie, v8)
	 * running on SPARCv9 processors.  True v9 programs are forced
	 * to use the instruction set as intended.
	 */
	if (lwp_getdatamodel(curthread->t_lwp) != DATAMODEL_ILP32)
		return (SIMU_ILLEGAL);
	switch (optype) {
	case OP_V8_BRANCH:
	case OP_V8_CALL:
		return (SIMU_ILLEGAL);	/* these don't have ignored fields */
		/*NOTREACHED*/
	case OP_V8_ARITH:
		switch (op3) {
		case IOP_V8_RETT:
			if (rd == 0 && !(i == 0 && ignor))
				return (SIMU_ILLEGAL);
			if (rd)
				inst &= ~(0x1f << 25);
			if (i == 0 && ignor)
				inst &= ~(0xff << 5);
			break;
		case IOP_V8_TCC:
			if (i == 0 && ignor != 0) {
				inst &= ~(0xff << 5);
			} else if (i == 1 && (((inst >> 7) & 0x3f) != 0)) {
				inst &= ~(0x3f << 7);
			} else {
				return (SIMU_ILLEGAL);
			}
			break;
		case IOP_V8_JMPL:
		case IOP_V8_RESTORE:
		case IOP_V8_SAVE:
			/* only the ignored field can be dirty for these */
			if (i == 0 && ignor) {
				inst &= ~(0xff << 5);
			} else {
				return (SIMU_ILLEGAL);
			}
			break;
		case IOP_V8_FCMP:
			if (rd == 0)
				return (SIMU_ILLEGAL);
			inst &= ~(0x1f << 25);
			break;
		case IOP_V8_RDASR:
			rs1 = ((inst >> 14) & 0x1f);
			if (rs1 == 1 || (rs1 >= 7 && rs1 <= 14)) {
				/*
				 * The instruction specifies an invalid
				 * state register - better bail out than
				 * "fix" it when we're not sure what was
				 * intended.
				 */
				return (SIMU_ILLEGAL);
			}
			/*
			 * Note: this case includes the 'stbar'
			 * instruction (rs1 == 15 && i == 0).
			 */
			if ((inst & 0x3fff) != 0)
				inst &= ~(0x3fff);
			break;
		case IOP_V8_SRA:
		case IOP_V8_SRL:
		case IOP_V8_SLL:
			if (ignor == 0)
				return (SIMU_ILLEGAL);
			inst &= ~(0xff << 5);
			break;
		case IOP_V8_ADD:
		case IOP_V8_AND:
		case IOP_V8_OR:
		case IOP_V8_XOR:
		case IOP_V8_SUB:
		case IOP_V8_ANDN:
		case IOP_V8_ORN:
		case IOP_V8_XNOR:
		case IOP_V8_ADDC:
		case IOP_V8_UMUL:
		case IOP_V8_SMUL:
		case IOP_V8_SUBC:
		case IOP_V8_UDIV:
		case IOP_V8_SDIV:
		case IOP_V8_ADDcc:
		case IOP_V8_ANDcc:
		case IOP_V8_ORcc:
		case IOP_V8_XORcc:
		case IOP_V8_SUBcc:
		case IOP_V8_ANDNcc:
		case IOP_V8_ORNcc:
		case IOP_V8_XNORcc:
		case IOP_V8_ADDCcc:
		case IOP_V8_UMULcc:
		case IOP_V8_SMULcc:
		case IOP_V8_SUBCcc:
		case IOP_V8_UDIVcc:
		case IOP_V8_SDIVcc:
		case IOP_V8_TADDcc:
		case IOP_V8_TSUBcc:
		case IOP_V8_TADDccTV:
		case IOP_V8_TSUBccTV:
		case IOP_V8_MULScc:
		case IOP_V8_WRASR:
		case IOP_V8_FLUSH:
			if (i != 0 || ignor == 0)
				return (SIMU_ILLEGAL);
			inst &= ~(0xff << 5);
			break;
		default:
			return (SIMU_ILLEGAL);
		}
		break;
	case OP_V8_LDSTR:
		switch (op3) {
		case IOP_V8_STFSR:
		case IOP_V8_LDFSR:
			if (rd == 0 && !(i == 0 && ignor))
				return (SIMU_ILLEGAL);
			if (rd)
				inst &= ~(0x1f << 25);
			if (i == 0 && ignor)
				inst &= ~(0xff << 5);
			break;
		default:
			/* alternate-space forms use the field as an asi */
			if (!IS_LDST_ALT(op3) && i == 0 && ignor)
				inst &= ~(0xff << 5);
			else
				return (SIMU_ILLEGAL);
			break;
		}
		break;
	default:
		return (SIMU_ILLEGAL);
	}

	as = p->p_as;

	AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
	mapseg = as_findseg(as, (caddr_t)rp->r_pc, 0);
	ASSERT(mapseg != NULL);
	svd = (struct segvn_data *)mapseg->s_data;

	/*
	 * We only create COW page for MAP_PRIVATE mappings.
	 */
	SEGVN_LOCK_ENTER(as, &svd->lock, RW_READER);
	if ((svd->type & MAP_TYPE) & MAP_SHARED) {
		SEGVN_LOCK_EXIT(as, &svd->lock);
		AS_LOCK_EXIT(as, &as->a_lock);
		return (SIMU_ILLEGAL);
	}
	SEGVN_LOCK_EXIT(as, &svd->lock);
	AS_LOCK_EXIT(as, &as->a_lock);

	/*
	 * A "flush" instruction using the user PC's vaddr will not work
	 * here, at least on Spitfire. Instead we create a temporary kernel
	 * mapping to the user's text page, then modify and flush that.
	 * Break COW by locking user page.
	 */
	if (as_fault(as->a_hat, as, (caddr_t)(rp->r_pc & PAGEMASK), PAGESIZE,
	    F_SOFTLOCK, S_READ))
		return (SIMU_FAULT);

	AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
	pfnum = hat_getpfnum(as->a_hat, (caddr_t)rp->r_pc);
	AS_LOCK_EXIT(as, &as->a_lock);
	if (pf_is_memory(pfnum)) {
		pp = page_numtopp_nolock(pfnum);
		ASSERT(pp == NULL || PAGE_LOCKED(pp));
	} else {
		pp = NULL;
	}

	/*
	 * Without a page structure there is nothing to map in; undo the
	 * softlock and give up rather than pass a NULL page to ppmapin().
	 */
	if (pp == NULL) {
		(void) as_fault(as->a_hat, as, (caddr_t)(rp->r_pc & PAGEMASK),
		    PAGESIZE, F_SOFTUNLOCK, S_READ);
		return (SIMU_FAULT);
	}

	AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
	ka = ppmapin(pp, PROT_READ|PROT_WRITE, (caddr_t)rp->r_pc);
	*(uint_t *)(ka + (uintptr_t)(rp->r_pc % PAGESIZE)) = inst;
	doflush(ka + (uintptr_t)(rp->r_pc % PAGESIZE));
	ppmapout(ka);
	AS_LOCK_EXIT(as, &as->a_lock);

	(void) as_fault(as->a_hat, as, (caddr_t)(rp->r_pc & PAGEMASK),
	    PAGESIZE, F_SOFTUNLOCK, S_READ);
	return (SIMU_RETRY);
}

/*
 * Get the value of a register for instruction simulation
 * by using the regs or window structure pointers.
 * Return 0 for success, and -1 for failure.  If there is a failure,
 * save the faulting address using badaddr pointer; *val is then set
 * to zero rather than left holding an indeterminate value.
 * We have 64 bit globals and outs, and 32 or 64 bit ins and locals.
 * Don't truncate globals/outs for 32 bit programs, for v8+ support.
 *
 * Register 0 always reads as zero.  Registers 1-15 come from the regs
 * structure; registers 16-31 are read from the register window save
 * area addressed by the stack pointer.
 */
int
getreg(struct regs *rp, uint_t reg, uint64_t *val, caddr_t *badaddr)
{
	uint64_t *rgs, *sp;
	int rv = 0;

	rgs = (uint64_t *)&rp->r_ps;		/* globals and outs */
	sp = (uint64_t *)rp->r_sp;		/* ins and locals */
	if (reg == 0) {
		*val = 0;
	} else if (reg < 16) {
		*val = rgs[reg];
	} else if (IS_V9STACK(sp)) {
		/* 64-bit frame: apply the V9 stack bias, 8-byte slots */
		uint64_t *rw = (uint64_t *)((uintptr_t)sp + V9BIAS64);
		uint64_t *addr = (uint64_t *)&rw[reg - 16];
		uint64_t res = 0;	/* stays zero if the fetch fails */

		if (USERMODE(rp->r_tstate)) {
			if (fuword64_nowatch(addr, &res) == -1) {
				*badaddr = (caddr_t)addr;
				rv = -1;
			}
		} else {
			res = *addr;
		}
		*val = res;
	} else {
		/* 32-bit frame: 4-byte slots, zero-extended on return */
		uint32_t *rw = (uint32_t *)(caddr32_t)sp;
		uint32_t *addr = (uint32_t *)&rw[reg - 16];
		uint32_t res = 0;	/* stays zero if the fetch fails */

		if (USERMODE(rp->r_tstate)) {
			if (fuword32_nowatch(addr, &res) == -1) {
				*badaddr = (caddr_t)addr;
				rv = -1;
			}
		} else {
			res = *addr;
		}
		*val = (uint64_t)res;
	}
	return (rv);
}

/*
 * Store *data into a register after instruction simulation, going
 * through the regs structure or the register window save area.
 * Returns 0 on success and -1 on failure; on failure the address
 * that could not be written is handed back through badaddr.
 * Globals and outs are 64 bits wide; ins and locals are 32 or 64 bits
 * wide depending on the stack frame format.
 * Globals/outs are not truncated for 32 bit programs (v8+ support).
 */
int
putreg(uint64_t	*data, struct regs *rp, uint_t reg, caddr_t *badaddr)
{
	uint64_t *frame = (uint64_t *)rp->r_sp;	/* ins and locals */
	struct machpcb *pcb;
	int status = 0;

	/* writes to register 0 are discarded */
	if (reg == 0)
		return (0);

	/* globals and outs live in the regs structure itself */
	if (reg < 16) {
		((uint64_t *)&rp->r_ps)[reg] = *data;
		return (0);
	}

	if (IS_V9STACK(frame)) {
		/* 64-bit frame: biased stack pointer, 8-byte slots */
		uint64_t *base = (uint64_t *)((uintptr_t)frame + V9BIAS64);
		uint64_t *slot = (uint64_t *)&base[reg - 16];
		uint64_t word = *data;

		if (!USERMODE(rp->r_tstate)) {
			*slot = word;
			return (0);
		}
		if (suword64_nowatch(slot, word) != 0) {
			*badaddr = (caddr_t)slot;
			status = -1;
		}
	} else {
		/* 32-bit frame: 4-byte slots, value truncated to 32 bits */
		uint32_t *base = (uint32_t *)(caddr32_t)frame;
		uint32_t *slot = (uint32_t *)&base[reg - 16];
		uint32_t word = (uint_t)*data;

		if (!USERMODE(rp->r_tstate)) {
			*slot = word;
			return (0);
		}
		if (suword32_nowatch(slot, word) != 0) {
			*badaddr = (caddr_t)slot;
			status = -1;
		}
	}

	/*
	 * Only user-mode stores reach this point.  A local or in
	 * register has been changed (or the attempt was made), so
	 * nuke the watchpoint return windows.
	 */
	pcb = lwptompcb(curthread->t_lwp);
	pcb->mpcb_rsp[0] = NULL;
	pcb->mpcb_rsp[1] = NULL;

	return (status);
}

/*
 * Calculate a memory reference address from instruction
 * operands, used to return the address of a fault, instead
 * of the instruction when an error occurs.  This is code that is
 * common with most of the routines that simulate instructions.
 *
 * On return *badaddr holds the effective address, and the result is
 * SIMU_UNALIGN if that address is not a multiple of the access size,
 * SIMU_SUCCESS otherwise.  Two exceptions leave *badaddr to the callee
 * or untouched: SIMU_FAULT is returned when a source register cannot be
 * read (getreg() sets *badaddr), and 0 is returned for ld[x]fsr/st[x]fsr
 * without computing an address.
 */
int
calc_memaddr(struct regs *rp, caddr_t *badaddr)
{
	uint_t	inst;
	uint_t	rs1, rs2;
	/*
	 * Size encodings not listed below are treated as byte accesses so
	 * that the alignment test is never made on an uninitialized size.
	 */
	int	sz = 1;
	int	immflg;
	int	floatflg;
	caddr_t  addr;
	uint64_t val;

	if (USERMODE(rp->r_tstate))
		inst = fetch_user_instr((caddr_t)rp->r_pc);
	else
		inst = *(uint_t *)rp->r_pc;

	rs1 = (inst >> 14) & 0x1f;
	rs2 = inst & 0x1f;
	floatflg = (inst >> 24) & 1;
	immflg = (inst >> 13) & 1;

	if (floatflg) {
		switch ((inst >> 19) & 3) {	/* map size bits to a number */
		case 0: sz = 4; break;		/* ldf/stf */
		case 1: return (0);		/* ld[x]fsr/st[x]fsr */
		case 2: sz = 16; break;		/* ldqf/stqf */
		case 3: sz = 8; break;		/* lddf/stdf */
		}
	} else {
		switch ((inst >> 19) & 0xf) {	/* map size bits to a number */
		case 0:				/* lduw */
		case 4:				/* stw */
		case 8:				/* ldsw */
		case 0xf:			/* swap */
			sz = 4; break;
		case 1:				/* ldub */
		case 5:				/* stb */
		case 9:				/* ldsb */
		case 0xd:			/* ldstub */
			sz = 1; break;
		case 2:				/* lduh */
		case 6:				/* sth */
		case 0xa:			/* ldsh */
			sz = 2; break;
		case 3:				/* ldd */
		case 7:				/* std */
		case 0xb:			/* ldx */
		case 0xe:			/* stx */
			sz = 8; break;
		}
	}

	if (USERMODE(rp->r_tstate))
		(void) flush_user_windows_to_stack(NULL);
	else
		flush_windows();

	if (getreg(rp, rs1, &val, badaddr))
		return (SIMU_FAULT);
	addr = (caddr_t)val;

	/* check immediate bit and use immediate field or reg (rs2) */
	if (immflg) {
		int imm;
		/*
		 * Sign extend the 13-bit immediate.  The xor/subtract form
		 * avoids left-shifting a one into the sign bit of an int.
		 */
		imm = (int)((inst & 0x1fff) ^ 0x1000) - 0x1000;
		addr += imm;			/* compute address */
	} else {
		if (getreg(rp, rs2, &val, badaddr))
			return (SIMU_FAULT);
		addr += val;
	}

	/*
	 * If this is a 32-bit program, chop the address accordingly.
	 */
	if (curproc->p_model == DATAMODEL_ILP32 &&
	    USERMODE(rp->r_tstate))
		addr = (caddr_t)(caddr32_t)addr;

	*badaddr = addr;
	/* sz is a power of two, so sz - 1 masks the misaligned bits */
	return ((uintptr_t)addr & (sz - 1) ? SIMU_UNALIGN : SIMU_SUCCESS);
}

/*
 * Return the size of a load or store instruction (1, 2, 4, 8, 16, 64).
 * Also compute the precise address by instruction disassembly.
 * (v9 page faults only provide the page address via the hardware.)
 * Return 0 on failure (not a load or store instruction, or a source
 * register could not be read).
 *
 * For S_EXEC the address is simply the pc and the size is 4.
 * Otherwise the computed effective address is stored through addrp.
 */
int
instr_size(struct regs *rp, caddr_t *addrp, enum seg_rw rdwr)
{
	uint_t	inst, op3, asi;
	uint_t	rd, rs1, rs2;
	int	sz = 0;
	int	immflg;
	int	floatflg;
	caddr_t	addr;
	caddr_t badaddr;
	uint64_t val;

	if (rdwr == S_EXEC) {
		*addrp = (caddr_t)rp->r_pc;
		return (4);
	}

	/*
	 * Fetch the instruction from user-level.
	 * We would like to assert this:
	 *   ASSERT(USERMODE(rp->r_tstate));
	 * but we can't because we can reach this point from a
	 * register window underflow/overflow and the v9 wbuf
	 * traps call trap() with T_USER even though r_tstate
	 * indicates a system trap, not a user trap.
	 */
	inst = fetch_user_instr((caddr_t)rp->r_pc);

	op3 = (inst >> 19) & 0x3f;
	rd = (inst >> 25) & 0x1f;
	rs1 = (inst >> 14) & 0x1f;
	rs2 = inst & 0x1f;
	floatflg = (inst >> 24) & 1;
	immflg = (inst >> 13) & 1;

	/* if not load or store do nothing.  can't happen? */
	if ((inst >> 30) != 3)
		return (0);

	if (immflg)
		asi = (uint_t)((rp->r_tstate >> TSTATE_ASI_SHIFT) &
				TSTATE_ASI_MASK);
	else
		asi = (inst >> 5) & 0xff;

	if (floatflg) {
		/* check for ld/st alternate and highest defined V9 asi */
		if ((op3 & 0x30) == 0x30 && asi > ASI_SNFL) {
			sz = extended_asi_size(asi);
		} else {
			switch (op3 & 3) {
			case 0:
				sz = 4;			/* ldf/stf/cas */
				break;
			case 1:
				if (rd == 0)
					sz = 4;		/* ldfsr/stfsr */
				else
					sz = 8;		/* ldxfsr/stxfsr */
				break;
			case 2:
				if (op3 == 0x3e)
					sz = 8;		/* casx */
				else
					sz = 16;	/* ldqf/stqf */
				break;
			case 3:
				sz = 8;			/* lddf/stdf */
				break;
			}
		}
	} else {
		switch (op3 & 0xf) {		/* map size bits to a number */
		case 0:				/* lduw */
		case 4:				/* stw */
		case 8:				/* ldsw */
		case 0xf:			/* swap */
			sz = 4; break;
		case 1:				/* ldub */
		case 5:				/* stb */
		case 9:				/* ldsb */
		case 0xd:			/* ldstub */
			sz = 1; break;
		case 2:				/* lduh */
		case 6:				/* sth */
		case 0xa:			/* ldsh */
			sz = 2; break;
		case 3:				/* ldd */
		case 7:				/* std */
		case 0xb:			/* ldx */
		case 0xe:			/* stx */
			sz = 8; break;
		}
	}

	if (sz == 0)	/* can't happen? */
		return (0);
	(void) flush_user_windows_to_stack(NULL);

	if (getreg(rp, rs1, &val, &badaddr))
		return (0);
	addr = (caddr_t)val;

	/* cas/casx don't use rs2 / simm13 to compute the address */
	if ((op3 & 0x3d) != 0x3c) {
		/* check immediate bit and use immediate field or reg (rs2) */
		if (immflg) {
			int imm;
			/*
			 * Sign extend the 13-bit immediate.  The
			 * xor/subtract form avoids left-shifting a one
			 * into the sign bit of an int.
			 */
			imm = (int)((inst & 0x1fff) ^ 0x1000) - 0x1000;
			addr += imm;		/* compute address */
		} else {
			/*
			 * asi's in the 0xCx range are partial store
			 * instructions.  For these, rs2 is a mask, not part of
			 * the address.
			 */
			if (!(floatflg && (asi & 0xf0) == 0xc0)) {
				if (getreg(rp, rs2, &val, &badaddr))
					return (0);
				addr += val;
			}
		}
	}

	/*
	 * If this is a 32-bit program, chop the address accordingly.
	 */
	if (curproc->p_model == DATAMODEL_ILP32)
		addr = (caddr_t)(caddr32_t)addr;

	*addrp = addr;
	ASSERT(sz != 0);
	return (sz);
}

/*
 * Read one 32-bit instruction word from the user address vaddr.
 * The fetch goes through fuword32_nowatch(), so watchpoints that
 * may be in effect are dealt with there.
 * Returns the instruction word, or -1 if it could not be read.
 */
int32_t
fetch_user_instr(caddr_t vaddr)
{
	caddr_t pc = vaddr;
	int32_t word;

	/*
	 * A 32-bit program only has a 32-bit address space, so chop
	 * the address down to fit.
	 */
	if (curproc->p_model == DATAMODEL_ILP32)
		pc = (caddr_t)(caddr32_t)pc;

	if (fuword32_nowatch(pc, (uint32_t *)&word) == -1)
		return (-1);

	return (word);
}