/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#ifndef	_CHEETAHASM_H
#define	_CHEETAHASM_H

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#ifdef	__cplusplus
extern "C" {
#endif

#ifdef _ASM
/* BEGIN CSTYLED */

#define	ASM_LD(reg, symbol)						\
	sethi	%hi(symbol), reg;					\
	ld	[reg + %lo(symbol)], reg;

#define	ASM_LDX(reg, symbol)						\
	sethi	%hi(symbol), reg;					\
	ldx	[reg + %lo(symbol)], reg;

#define	ASM_JMP(reg, symbol)						\
	sethi	%hi(symbol), reg;					\
	jmp	reg + %lo(symbol);					\
	nop

/*
 * Macro for computing the address at a given offset from the
 * 'cpu_private' ptr.  The 'cpu_private' ptr lives in the machcpu
 * structure.
 *  off_reg:  Register holding the offset from the 'cpu_private' ptr.
 *  scr1:    Scratch, the computed ptr is returned in this register.
 *  scr2:    Scratch.
 *  label:   Label to branch to if cpu_private ptr is null/zero.
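 * pseudo-code (a sketch; CPU_ADDR and CPU_PRIVATE are defined elsewhere):
 *	scr1 = CPU->cpu_m.cpu_private;
 *	if (scr1 == NULL)
 *		goto label;
 *	scr1 += off_reg;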
 */
#define	GET_CPU_PRIVATE_PTR(off_reg, scr1, scr2, label)			\
	CPU_ADDR(scr1, scr2);						\
	ldn	[scr1 + CPU_PRIVATE], scr1;				\
	cmp	scr1, 0;						\
	be	label;							\
	  nop;								\
	add	scr1, off_reg, scr1

/*
 * Macro version of get_dcache_dtag.  We use this macro in the
 * CPU logout code. Since the Dcache is virtually indexed, only
 * bits [12:5] of the AFAR can be used so we need to search through
 * 8 indexes (4 ways + bit 13) in order to find the tag we want.
 *   afar:  input AFAR, not modified.
 *   datap: input ptr to ch_dc_data_t, at end pts to end of ch_dc_data_t.
 *   scr1:  scratch.
 *   scr2:  scratch, will hold tag to look for.
 *   scr3:  used for Dcache index, loops through 4 ways.
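 *
 * As a rough sketch (with the CH_DCTAG_* and CH_DCACHE_IDX_* constants
 * defined elsewhere), the search amounts to:
 *	tag = (afar >> CH_DCTAG_PA_SHIFT) | CH_DCTAG_VALID_BIT;
 *	for (idx = afar & CH_DCACHE_IDX_MASK; idx < CH_DCACHE_IDX_LIMIT;
 *	    idx += CH_DCACHE_IDX_INCR)
 *		if (dcache_tag(idx) == tag)
 *			capture tag, utag, snoop tag and data, then stop;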
 */
#define	GET_DCACHE_DTAG(afar, datap, scr1, scr2, scr3)			\
	set	CH_DCACHE_IDX_MASK, scr3;				\
	and	afar, scr3, scr3;					\
	srlx	afar, CH_DCTAG_PA_SHIFT, scr2;				\
	b	1f;							\
	  or	scr2, CH_DCTAG_VALID_BIT, scr2; /* tag we want */	\
	.align	128;							\
1:									\
	ldxa	[scr3]ASI_DC_TAG, scr1;		/* read tag */		\
	cmp	scr1, scr2;						\
	bne	4f;				/* not found? */	\
	  nop;								\
	stxa	scr3, [datap + CH_DC_IDX]%asi;	/* store index */	\
	stxa	scr1, [datap + CH_DC_TAG]%asi;	/* store tag */		\
	membar	#Sync;			/* Cheetah PRM 10.6.3 */	\
	ldxa	[scr3]ASI_DC_UTAG, scr1;	/* read utag */		\
	membar	#Sync;			/* Cheetah PRM 10.6.3 */	\
	stxa	scr1, [datap + CH_DC_UTAG]%asi;				\
	ldxa	[scr3]ASI_DC_SNP_TAG, scr1;	/* read snoop tag */	\
	stxa	scr1, [datap + CH_DC_SNTAG]%asi;			\
	add	datap, CH_DC_DATA, datap;				\
	clr	scr2;							\
2:									\
	membar	#Sync;			/* Cheetah PRM 10.6.1 */	\
	ldxa	[scr3 + scr2]ASI_DC_DATA, scr1;	/* read data */		\
	membar	#Sync;			/* Cheetah PRM 10.6.1 */	\
	stxa	scr1, [datap]%asi;					\
	add	datap, 8, datap;					\
	cmp	scr2, CH_DC_DATA_REG_SIZE - 8;				\
	blt	2b;							\
	  add	scr2, 8, scr2;						\
									\
	GET_CPU_IMPL(scr2);	/* Parity bits are elsewhere for */	\
	cmp	scr2, PANTHER_IMPL;	/* panther processors. */	\
	bne,a	5f;			/* Done if not panther. */	\
	  add	datap, 8, datap; /* Skip to the end of the struct. */	\
	clr	scr2;							\
	add	datap, 7, datap; /* offset of the last parity byte */	\
	mov	1, scr1;						\
	sll	scr1, PN_DC_DATA_PARITY_BIT_SHIFT, scr1;		\
	or	scr3, scr1, scr3; /* add DC_data_parity bit to index */	\
3:									\
	membar	#Sync;			/* Cheetah PRM 10.6.1 */	\
	ldxa	[scr3 + scr2]ASI_DC_DATA, scr1;	/* read parity bits */	\
	membar	#Sync;			/* Cheetah PRM 10.6.1 */	\
	stba	scr1, [datap]%asi;					\
	dec	datap;							\
	cmp	scr2, CH_DC_DATA_REG_SIZE - 8;				\
	blt	3b;							\
	  add	scr2, 8, scr2;						\
	b	5f;							\
	  add	datap, 5, datap; /* set pointer to end of our struct */	\
4:									\
	set	CH_DCACHE_IDX_INCR, scr1;	/* incr. idx (scr3) */	\
	add	scr3, scr1, scr3;					\
	set	CH_DCACHE_IDX_LIMIT, scr1;	/* done? */		\
	cmp	scr3, scr1;						\
	blt	1b;							\
	  nop;								\
	add	datap, CH_DC_DATA_SIZE, datap;				\
5:

/*
 * Macro version of get_icache_dtag.  We use this macro in the CPU
 * logout code. If the Icache is on, we don't want to capture the data.
 *   afar:  input AFAR, not modified.
 *   datap: input ptr to ch_ic_data_t, at end pts to end of ch_ic_data_t.
 *   scr1:  scratch.
 *   scr2:  scratch, will hold tag to look for.
 *   scr3:  used for Icache index, loops through 4 ways.
 * Note: For Panther, the Icache is virtually indexed and increases in
 * size to 64KB (instead of 32KB) with a line size of 64 bytes (instead
 * of 32). This means the IC_addr index bits[14:7] for Panther now
 * correspond to VA bits[13:6]. But since it is virtually indexed, we
 * still mask out only bits[12:5] from the AFAR (we have to manually
 * check bit 13). In order to make this code work for all processors,
 * we end up checking twice as many indexes (8 instead of 4) as required
 * for non-Panther CPUs and saving off twice as much data (16 instructions
 * instead of just 8).
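 *
 * In outline (a sketch, not the exact instruction sequence):
 *	mask = IS_PANTHER(impl) ? PN_ICACHE_VA_IDX_MASK : CH_ICACHE_IDX_MASK;
 *	tag = (afar >> CH_ICPATAG_SHIFT) & ~CH_ICPATAG_LBITS;
 *	for (idx = (afar & mask) << CH_ICACHE_IDX_SHIFT;
 *	    idx < PN_ICACHE_IDX_LIMIT; idx += CH_ICACHE_IDX_INCR)
 *		if ((icache_patag(idx) & ~CH_ICPATAG_LBITS) == tag)
 *			capture tags and instruction data, then stop;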
 */
#define	GET_ICACHE_DTAG(afar, datap, scr1, scr2, scr3)			\
	ldxa	[%g0]ASI_DCU, scr1;					\
	btst	DCU_IC, scr1;		/* is Icache enabled? */	\
	bne,a	6f;			/* yes, don't capture */	\
	  add	datap, CH_IC_DATA_SIZE, datap;	/* annul if no branch */	\
	GET_CPU_IMPL(scr2);	/* Panther only uses VA[13:6] */	\
	cmp	scr2, PANTHER_IMPL;	/* and we also want to mask */	\
	be	1f;			/* out bit 13 since the */	\
	  nop;				/* Panther I$ is VIPT. */	\
	set	CH_ICACHE_IDX_MASK, scr3;				\
	b	2f;							\
	  nop;								\
1:									\
	set	PN_ICACHE_VA_IDX_MASK, scr3;				\
2:									\
	and	afar, scr3, scr3;					\
	sllx	scr3, CH_ICACHE_IDX_SHIFT, scr3;			\
	srlx	afar, CH_ICPATAG_SHIFT, scr2;	/* pa tag we want */	\
	andn	scr2, CH_ICPATAG_LBITS, scr2;	/* mask off lower */	\
	b	3f;							\
	  nop;								\
	.align	128;							\
3:									\
	ldxa	[scr3]ASI_IC_TAG, scr1;		/* read pa tag */	\
	andn	scr1, CH_ICPATAG_LBITS, scr1;	/* mask off lower */	\
	cmp	scr1, scr2;						\
	bne	5f;				/* not found? */	\
	  nop;								\
	stxa	scr3, [datap + CH_IC_IDX]%asi;	/* store index */	\
	stxa	scr1, [datap + CH_IC_PATAG]%asi; /* store pa tag */	\
	add	scr3, CH_ICTAG_UTAG, scr3;	/* read utag */		\
	ldxa	[scr3]ASI_IC_TAG, scr1;					\
	add	scr3, (CH_ICTAG_UPPER - CH_ICTAG_UTAG), scr3;		\
	stxa	scr1, [datap + CH_IC_UTAG]%asi;				\
	ldxa	[scr3]ASI_IC_TAG, scr1;		/* read upper tag */	\
	add	scr3, (CH_ICTAG_LOWER - CH_ICTAG_UPPER), scr3;		\
	stxa	scr1, [datap + CH_IC_UPPER]%asi;			\
	ldxa	[scr3]ASI_IC_TAG, scr1;		/* read lower tag */	\
	andn	scr3, CH_ICTAG_TMASK, scr3;				\
	stxa	scr1, [datap + CH_IC_LOWER]%asi;			\
	ldxa	[scr3]ASI_IC_SNP_TAG, scr1;	/* read snoop tag */	\
	stxa	scr1, [datap + CH_IC_SNTAG]%asi;			\
	add	datap, CH_IC_DATA, datap;				\
	clr	scr2;							\
4:									\
	ldxa	[scr3 + scr2]ASI_IC_DATA, scr1;	/* read ins. data */	\
	stxa	scr1, [datap]%asi;					\
	add	datap, 8, datap;					\
	cmp	scr2, PN_IC_DATA_REG_SIZE - 8;				\
	blt	4b;							\
	  add	scr2, 8, scr2;						\
	b	6f;							\
	  nop;								\
5:									\
	set	CH_ICACHE_IDX_INCR, scr1;	/* incr. idx (scr3) */	\
	add	scr3, scr1, scr3;					\
	set	PN_ICACHE_IDX_LIMIT, scr1;	/* done? */		\
	cmp	scr3, scr1;						\
	blt	3b;							\
	  nop;								\
	add	datap, CH_IC_DATA_SIZE, datap;				\
6:

#if defined(JALAPENO) || defined(SERRANO)
/*
 * Macro version of get_ecache_dtag.  We use this macro in the
 * CPU logout code.
 *   afar:	input AFAR, not modified.
 *   datap:	Ptr to ch_ec_data_t, at end pts just past ch_ec_data_t.
 *   ec_way:	Constant value (way number).
 *   scr1:	Scratch.
 *   scr2:	Scratch.
 *   scr3:	Scratch.
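 *
 * The E$ index computed into scr3 is, roughly:
 *	idx = ((afar & (set_size - 1)) & ~(JP_ECACHE_MAX_LSIZE - 1)) |
 *	    (ec_way << JP_EC_TAG_DATA_WAY_SHIFT);
 * where set_size = JP_ECACHE_MAX_SIZE / JP_ECACHE_NWAY.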
 */
#define	GET_ECACHE_DTAG(afar, datap, ec_way, scr1, scr2, scr3)		\
	mov	ec_way, scr1;						\
	and	scr1, JP_ECACHE_NWAY - 1, scr1;	/* mask E$ way bits */	\
	sllx	scr1, JP_EC_TAG_DATA_WAY_SHIFT, scr1;			\
	set	((JP_ECACHE_MAX_SIZE / JP_ECACHE_NWAY) - 1), scr2;	\
	and	afar, scr2, scr3;		/* get set offset */	\
	andn	scr3, (JP_ECACHE_MAX_LSIZE - 1), scr3; /* VA<5:0>=0 */	\
	or	scr3, scr1, scr3;		/* or WAY bits */	\
	b	1f;							\
	  stxa	scr3, [datap + CH_EC_IDX]%asi;	/* store E$ index */	\
	.align	64;							\
1:									\
	JP_EC_DIAG_ACCESS_MEMBAR;					\
	ldxa    [scr3]ASI_EC_DIAG, scr1;	/* get E$ tag */	\
	JP_EC_DIAG_ACCESS_MEMBAR;					\
	stxa	scr1, [datap + CH_EC_TAG]%asi;				\
	add	datap, CH_EC_DATA, datap;				\
2:									\
	ldxa	[scr3]ASI_EC_R, %g0;		/* ld E$ stging regs */	\
	clr	scr1;							\
3:						/* loop thru 5 regs */	\
	ldxa	[scr1]ASI_EC_DATA, scr2;				\
	stxa	scr2, [datap]%asi;					\
	add	datap, 8, datap;					\
	cmp	scr1, CH_ECACHE_STGREG_TOTALSIZE - 8;			\
	bne	3b;							\
	   add	scr1, 8, scr1;						\
	btst	CH_ECACHE_STGREG_SIZE, scr3;	/* done? */		\
	beq	2b;							\
	   add	scr3, CH_ECACHE_STGREG_SIZE, scr3

#define	GET_ECACHE_DTAGS(afar, datap, scr1, scr2, scr3)			\
	GET_ECACHE_DTAG(afar, datap, 0, scr1, scr2, scr3);		\
	GET_ECACHE_DTAG(afar, datap, 1, scr1, scr2, scr3);		\
	GET_ECACHE_DTAG(afar, datap, 2, scr1, scr2, scr3);		\
	GET_ECACHE_DTAG(afar, datap, 3, scr1, scr2, scr3);		\
	add	datap, (CHD_EC_DATA_SETS-4)*CH_EC_DATA_SIZE, datap;	\
	add	datap, CH_EC_DATA_SIZE * PN_L2_NWAYS, datap;

/*
 * Jalapeno does not have cores so these macros are null.
 */
#define	PARK_SIBLING_CORE(dcucr_reg, scr1, scr2)
#define	UNPARK_SIBLING_CORE(dcucr_reg, scr1, scr2)

#if defined(JALAPENO)
/*
 * Jalapeno gets primary AFSR and AFAR.  All bits in the AFSR except
 * the fatal error bits are cleared.
 *	datap:		pointer to cpu logout structure.
 *	afar:		returned primary AFAR value.
 *	scr1:		scratch
 *	scr2:		scratch
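 *
 * Since AFSR bits are write-one-to-clear, the effect is roughly:
 *	afar = AFAR;  afsr = AFSR;  save both in the logout area;
 *	AFSR = afsr & ~C_AFSR_FATAL_ERRS;  clears all but the fatal bits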
 */
#define	GET_AFSR_AFAR(datap, afar, scr1, scr2)				\
	ldxa	[%g0]ASI_AFAR, afar;					\
	stxa	afar, [datap + (CH_CLO_DATA + CH_CHD_AFAR)]%asi;	\
	ldxa	[%g0]ASI_AFSR, scr2;					\
	stxa	scr2, [datap + (CH_CLO_DATA + CH_CHD_AFSR)]%asi;	\
	sethi	%hh(C_AFSR_FATAL_ERRS), scr1;				\
	sllx	scr1, 32, scr1;						\
	bclr	scr1, scr2;	/* Clear fatal error bits here, so */	\
	stxa	scr2, [%g0]ASI_AFSR; /* they're left as is in AFSR */	\
	membar	#Sync

/*
 * Jalapeno has no shadow AFAR, null operation.
 */
#define	GET_SHADOW_DATA(afar, datap, scr1, scr2, scr3)

#elif defined(SERRANO)
/*
 * Serrano gets primary AFSR and AFAR.  All bits in the AFSR except
 * the fatal error bits are cleared.  For Serrano, we also save the
 * AFAR2 register.
 *	datap:	pointer to cpu logout structure.
 *	afar:	returned primary AFAR value.
 *	scr1:	scratch
 *	scr2:	scratch
 */
#define	GET_AFSR_AFAR(datap, afar, scr1, scr2)				\
	set	ASI_MCU_AFAR2_VA, scr1;					\
	ldxa	[scr1]ASI_MCU_CTRL, afar;				\
	stxa	afar, [datap + (CH_CLO_DATA + CH_CHD_AFAR2)]%asi;	\
	ldxa	[%g0]ASI_AFAR, afar;					\
	stxa	afar, [datap + (CH_CLO_DATA + CH_CHD_AFAR)]%asi;	\
	ldxa	[%g0]ASI_AFSR, scr2;					\
	stxa	scr2, [datap + (CH_CLO_DATA + CH_CHD_AFSR)]%asi;	\
	sethi	%hh(C_AFSR_FATAL_ERRS), scr1;				\
	sllx	scr1, 32, scr1;						\
	bclr	scr1, scr2;	/* Clear fatal error bits here, so */	\
	stxa	scr2, [%g0]ASI_AFSR; /* they're left as is in AFSR */ 	\
	membar	#Sync

/*
 * Serrano needs to capture E$, D$ and I$ lines associated with afar2.
 *      afar:   scratch, holds afar2.
 *      datap:  pointer to cpu logout structure
 *      scr1:   scratch
 *      scr2:   scratch
 *      scr3:   scratch
 */
#define	GET_SHADOW_DATA(afar, datap, scr1, scr2, scr3)		\
	ldxa	[datap + (CH_CLO_DATA + CH_CHD_AFAR2)]%asi, afar;	\
	add	datap, CH_CLO_SDW_DATA + CH_CHD_EC_DATA, datap;		\
	GET_ECACHE_DTAGS(afar, datap, scr1, scr2, scr3);		\
	GET_DCACHE_DTAG(afar, datap, scr1, scr2, scr3);			\
	GET_ICACHE_DTAG(afar, datap, scr1, scr2, scr3);			\
	sub	datap, CH_CPU_LOGOUT_SIZE, datap
#endif /* SERRANO */

#elif defined(CHEETAH_PLUS)
/*
 * Macro version of get_ecache_dtag.  We use this macro in the
 * CPU logout code.
 *   afar:	input AFAR, not modified.
 *   datap:	Ptr to ch_ec_data_t, at end pts just past ch_ec_data_t.
 *   pn_way:	ecache way for panther (value = 0-3). For non-panther
 *		cpus, this macro will be called with pn_way = 0.
 *   scr1:	Scratch.
 *   scr2:	Scratch.
 *   scr3:	Scratch.
 */
#define	GET_ECACHE_DTAG(afar, datap, pn_way, scr1, scr2, scr3)		\
	mov	afar, scr3;						\
	andn	scr3, (CH_ECACHE_SUBBLK_SIZE - 1), scr3; /* VA<5:0>=0 */\
	set	(CH_ECACHE_8M_SIZE - 1), scr2;				\
	and	scr3, scr2, scr3;		/* VA<63:23>=0 */	\
	mov	pn_way, scr1;	/* panther L3$ is 4-way so we ...    */	\
	sllx	scr1, PN_L3_WAY_SHIFT, scr1;	/* need to mask...   */	\
	or	scr3, scr1, scr3;	/* in the way bits <24:23>.  */	\
	b	1f;							\
	   stxa	scr3, [datap + CH_EC_IDX]%asi;	/* store E$ index */	\
	.align	64;							\
1:									\
	ldxa    [scr3]ASI_EC_DIAG, scr1;	/* get E$ tag */	\
	stxa     scr1, [datap + CH_EC_TAG]%asi;				\
	set	CHP_ECACHE_IDX_TAG_ECC, scr1;				\
	or	scr3, scr1, scr1;					\
	ldxa    [scr1]ASI_EC_DIAG, scr1;	/* get E$ tag ECC */	\
	stxa	scr1, [datap + CH_EC_TAG_ECC]%asi;			\
	add	datap, CH_EC_DATA, datap;				\
2:									\
	ldxa	[scr3]ASI_EC_R, %g0;		/* ld E$ stging regs */	\
	clr	scr1;							\
3:						/* loop thru 5 regs */	\
	ldxa	[scr1]ASI_EC_DATA, scr2;				\
	stxa	scr2, [datap]%asi;					\
	add	datap, 8, datap;					\
	cmp	scr1, CH_ECACHE_STGREG_TOTALSIZE - 8;			\
	bne	3b;							\
	   add	scr1, 8, scr1;						\
	btst	CH_ECACHE_STGREG_SIZE, scr3;	/* done? */		\
	beq	2b;							\
	   add	scr3, CH_ECACHE_STGREG_SIZE, scr3

/*
 * If this is a panther, we need to make sure the sibling core is
 * parked so that we avoid any race conditions during diagnostic
 * accesses to the shared L2 and L3 caches.
 * dcucr_reg:	This register will be used to keep track of whether
 *		or not we need to unpark the core later.
 *		It just so happens that we also use this same register
 *		to keep track of our saved DCUCR value so we only touch
 *		bit 4 of the register (which is a "reserved" bit in the
 *		DCUCR) for keeping track of core parking.
 * scr1:	Scratch register.
 * scr2:	Scratch register.
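 *
 * In outline (a sketch of the protocol):
 *	if (!IS_PANTHER(impl) || sibling core is not running)
 *		dcucr_reg &= ~PN_PARKED_OTHER_CORE;
 *	else {
 *		dcucr_reg |= PN_PARKED_OTHER_CORE;
 *		ASI_CORE_RUNNING_RW = (1 << our_core_id);
 *		spin until ASI_CORE_RUNNING_STATUS == (1 << our_core_id);
 *	}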
 */
#define	PARK_SIBLING_CORE(dcucr_reg, scr1, scr2)			\
	GET_CPU_IMPL(scr1);						\
	cmp	scr1, PANTHER_IMPL;	/* only park for panthers */	\
	bne,a	%xcc, 2f;						\
	  andn	dcucr_reg, PN_PARKED_OTHER_CORE, dcucr_reg;		\
	set	ASI_CORE_RUNNING_STATUS, scr1;	/* check other core */	\
	ldxa	[scr1]ASI_CMP_SHARED, scr2;	/* is it running?   */	\
	cmp	scr2, PN_BOTH_CORES_RUNNING;				\
	bne,a	%xcc, 2f;	/* if not running, we are done */	\
	  andn	dcucr_reg, PN_PARKED_OTHER_CORE, dcucr_reg;		\
	or	dcucr_reg, PN_PARKED_OTHER_CORE, dcucr_reg;		\
	set	ASI_CORE_ID, scr1;					\
	ldxa	[scr1]ASI_CMP_PER_CORE, scr2;				\
	and	scr2, COREID_MASK, scr2;				\
	or	%g0, 1, scr1;		/* find out which core... */	\
	sll	scr1, scr2, scr2;	/* ... we need to park... */	\
1:									\
	set	ASI_CORE_RUNNING_RW, scr1;				\
	stxa	scr2, [scr1]ASI_CMP_SHARED;	/* ... and park it. */	\
	membar	#Sync;							\
	set	ASI_CORE_RUNNING_STATUS, scr1;	/* spin until... */	\
	ldxa	[scr1]ASI_CMP_SHARED, scr1;	/* ... the other...  */	\
	cmp	scr1, scr2;	/* ...core is parked according to... */	\
	bne,a	%xcc, 1b;	/* ...the core running status reg.  */	\
	  nop;								\
2:

/*
 * The core running this code will unpark its sibling core if the
 * sibling core had been parked by the current core earlier in this
 * trap handler.
 * dcucr_reg:	This register is used to keep track of whether or not
 *		we need to unpark our sibling core.
 *		It just so happens that we also use this same register
 *		to keep track of our saved DCUCR value so we only touch
 *		bit 4 of the register (which is a "reserved" bit in the
 *		DCUCR) for keeping track of core parking.
 * scr1:	Scratch register.
 * scr2:	Scratch register.
 */
#define	UNPARK_SIBLING_CORE(dcucr_reg, scr1, scr2)			\
	btst	PN_PARKED_OTHER_CORE, dcucr_reg;			\
	bz,pt	%xcc, 1f;	/* if nothing to unpark, we are done */	\
	  andn	dcucr_reg, PN_PARKED_OTHER_CORE, dcucr_reg;		\
	set	ASI_CORE_RUNNING_RW, scr1;				\
	set	PN_BOTH_CORES_RUNNING, scr2;	/* we want both...   */	\
	stxa	scr2, [scr1]ASI_CMP_SHARED;	/* ...cores running. */	\
	membar	#Sync;							\
1:

/*
 * Cheetah+ and Jaguar get both primary and secondary AFSR/AFAR.  All bits
 * in the primary AFSR are cleared except the fatal error bits.  For Panther,
 * we also have to read and clear the AFSR_EXT, again leaving the fatal
 * error bits alone.
 *	datap:		pointer to cpu logout structure.
 *	afar:		returned primary AFAR value.
 *	scr1:		scratch
 *	scr2:		scratch
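 *
 * pseudo-code (a sketch; AFSR writes are write-one-to-clear):
 *	save shadow AFAR/AFSR, then primary AFAR/AFSR, in the logout area;
 *	AFSR = afsr & ~C_AFSR_FATAL_ERRS;
 *	if (IS_PANTHER(impl)) {
 *		save shadow and primary AFSR_EXT;
 *		AFSR_EXT = afsr_ext & ~C_AFSR_EXT_FATAL_ERRS;
 *	}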
 */
#define	GET_AFSR_AFAR(datap, afar, scr1, scr2)				\
	set	ASI_SHADOW_REG_VA, scr1;				\
	ldxa	[scr1]ASI_AFAR, scr2;					\
	stxa	scr2, [datap + (CH_CLO_SDW_DATA + CH_CHD_AFAR)]%asi;	\
	ldxa	[scr1]ASI_AFSR, scr2;					\
	stxa	scr2, [datap + (CH_CLO_SDW_DATA + CH_CHD_AFSR)]%asi;	\
	ldxa	[%g0]ASI_AFAR, afar;					\
	stxa	afar, [datap + (CH_CLO_DATA + CH_CHD_AFAR)]%asi;	\
	ldxa	[%g0]ASI_AFSR, scr2;					\
	stxa	scr2, [datap + (CH_CLO_DATA + CH_CHD_AFSR)]%asi;	\
	sethi	%hh(C_AFSR_FATAL_ERRS), scr1;				\
	sllx	scr1, 32, scr1;						\
	bclr	scr1, scr2;	/* Clear fatal error bits here, so */ 	\
	stxa	scr2, [%g0]ASI_AFSR; /* they're left as is in AFSR */	\
	membar	#Sync;							\
	GET_CPU_IMPL(scr1);						\
	cmp	scr1, PANTHER_IMPL;					\
	bne	%xcc, 1f;						\
	   nop;								\
	set	ASI_SHADOW_AFSR_EXT_VA, scr1;	/* shadow AFSR_EXT */	\
	ldxa	[scr1]ASI_AFSR, scr2;					\
	stxa	scr2, [datap + (CH_CLO_SDW_DATA + CH_CHD_AFSR_EXT)]%asi; \
	set	ASI_AFSR_EXT_VA, scr1;		/* primary AFSR_EXT */	\
	ldxa	[scr1]ASI_AFSR, scr2;					\
	stxa	scr2, [datap + (CH_CLO_DATA + CH_CHD_AFSR_EXT)]%asi;	\
	set	C_AFSR_EXT_FATAL_ERRS, scr1;				\
	bclr	scr1, scr2;	/* Clear fatal error bits here, */	\
	set	ASI_AFSR_EXT_VA, scr1;	/* so they're left */		\
	stxa	scr2, [scr1]ASI_AFSR;	/* as is in AFSR_EXT */		\
	membar	#Sync;							\
1:

/*
 * This macro is used in the CPU logout code to capture diagnostic
 * information from the L2 cache on panther processors.
 *   afar:	input AFAR, not modified.
 *   datap:	Ptr to pn_l2_data_t, at end pts just past pn_l2_data_t.
 *   scr1:	Scratch.
 *   scr2:	Scratch.
 *   scr3:	Scratch.
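 *
 * In outline (a sketch): for each of the PN_L2_NWAYS ways,
 *	idx = (afar & PN_L2_INDEX_MASK) + way * PN_L2_SET_SIZE;
 *	record the L2 tag, both 256-bit data halves and their ECC at idx;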
 */
#define	GET_PN_L2_CACHE_DTAGS(afar, datap, scr1, scr2, scr3)		\
	mov	afar, scr3;						\
	set	PN_L2_INDEX_MASK, scr1;					\
	and	scr3, scr1, scr3;					\
	b	1f;	/* code to read tags and data should be ...  */	\
	   nop;		/* ...on the same cache line if possible.    */	\
	.align	128;	/* update this line if you add lines below. */	\
1:									\
	stxa	scr3, [datap + CH_EC_IDX]%asi;	/* store L2$ index  */	\
	ldxa	[scr3]ASI_L2_TAG, scr1;		/* read the L2$ tag */	\
	stxa	scr1, [datap + CH_EC_TAG]%asi;				\
	add	datap, CH_EC_DATA, datap;				\
	clr	scr1;							\
2:									\
	ldxa	[scr3 + scr1]ASI_L2_DATA, scr2;	/* loop through     */	\
	stxa	scr2, [datap]%asi;		/* <511:256> of L2  */	\
	add	datap, 8, datap;		/* data and record  */	\
	cmp	scr1, (PN_L2_LINESIZE / 2) - 8;	/* it in the cpu    */	\
	bne	2b;				/* logout struct.   */	\
	  add	scr1, 8, scr1;						\
	set	PN_L2_DATA_ECC_SEL, scr2;	/* ECC_sel bit.     */	\
	ldxa	[scr3 + scr2]ASI_L2_DATA, scr2;	/* Read and record  */	\
	stxa	scr2, [datap]%asi;		/* ecc of <511:256> */	\
	add	datap, 8, datap;					\
3:									\
	ldxa	[scr3 + scr1]ASI_L2_DATA, scr2;	/* loop through     */	\
	stxa	scr2, [datap]%asi;		/* <255:0> of L2    */	\
	add	datap, 8, datap;		/* data and record  */	\
	cmp	scr1, PN_L2_LINESIZE - 8;	/* it in the cpu    */	\
	bne	3b;				/* logout struct.   */	\
	  add	scr1, 8, scr1;						\
	set	PN_L2_DATA_ECC_SEL, scr2;	/* ECC_sel bit.     */	\
	add	scr2, PN_L2_ECC_LO_REG, scr2;				\
	ldxa	[scr3 + scr2]ASI_L2_DATA, scr2;	/* Read and record  */	\
	stxa	scr2, [datap]%asi;		/* ecc of <255:0>.  */	\
	add	datap, 8, datap;		/* Advance pointer  */	\
	set	PN_L2_SET_SIZE, scr2;					\
	set	PN_L2_MAX_SET, scr1;					\
	cmp	scr1, scr3;	/* more ways to try for this line? */	\
	bg,a	%xcc, 1b;	/* if so, start over with next way */	\
	  add	scr3, scr2, scr3

/*
 * Cheetah+ assumes E$ is 2-way and grabs both E$ lines associated with afar.
 *	afar:	AFAR from access.
 *	datap:	pointer to cpu logout structure.
 *	scr1:	scratch
 *	scr2:	scratch
 *	scr3:	scratch
 */
#define	GET_ECACHE_DTAGS(afar, datap, scr1, scr2, scr3)			\
	GET_CPU_IMPL(scr1);						\
	cmp	scr1, PANTHER_IMPL;					\
	bne	%xcc, 4f;						\
	  nop;								\
	GET_ECACHE_DTAG(afar, datap, 0, scr1, scr2, scr3);		\
	GET_ECACHE_DTAG(afar, datap, 1, scr1, scr2, scr3);		\
	GET_ECACHE_DTAG(afar, datap, 2, scr1, scr2, scr3);		\
	GET_ECACHE_DTAG(afar, datap, 3, scr1, scr2, scr3);		\
	add	datap, (CHD_EC_DATA_SETS-4)*CH_EC_DATA_SIZE, datap;	\
	GET_PN_L2_CACHE_DTAGS(afar, datap, scr1, scr2, scr3);		\
	b	5f;							\
	  nop;								\
4:									\
	GET_ECACHE_DTAG(afar, datap, 0, scr1, scr2, scr3);		\
	GET_ECACHE_WAY_BIT(scr1, scr2);					\
	xor	afar, scr1, afar;					\
	GET_ECACHE_DTAG(afar, datap, 0, scr1, scr2, scr3);		\
	GET_ECACHE_WAY_BIT(scr1, scr2);		/* restore AFAR */	\
	xor	afar, scr1, afar;					\
	add	datap, (CHD_EC_DATA_SETS-2)*CH_EC_DATA_SIZE, datap;	\
	add	datap, CH_EC_DATA_SIZE * PN_L2_NWAYS, datap;		\
5:

/*
 * Cheetah+ needs to capture E$, D$ and I$ lines associated with
 * shadow afar.
 *	afar:	scratch, holds shadow afar.
 *	datap:	pointer to cpu logout structure
 *	scr1:	scratch
 *	scr2:	scratch
 *	scr3:	scratch
 */
#define	GET_SHADOW_DATA(afar, datap, scr1, scr2, scr3)		\
	ldxa	[datap + (CH_CLO_SDW_DATA + CH_CHD_AFAR)]%asi, afar;	\
	add	datap, CH_CLO_SDW_DATA + CH_CHD_EC_DATA, datap;	\
	GET_ECACHE_DTAGS(afar, datap, scr1, scr2, scr3);		\
	GET_DCACHE_DTAG(afar, datap, scr1, scr2, scr3);			\
	GET_ICACHE_DTAG(afar, datap, scr1, scr2, scr3);			\
	sub	datap, CH_CPU_LOGOUT_SIZE, datap

/*
 * Compute the "Way" bit for 2-way Ecache for Cheetah+.
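 * Roughly: way_bit = cpunodes[CPU->cpu_id].ecache_size / 2, i.e. half
 * the per-cpu E$ size as read from the cpunodes array.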
 */
#define	GET_ECACHE_WAY_BIT(scr1, scr2)					\
	CPU_INDEX(scr1, scr2);						\
	mulx	scr1, CPU_NODE_SIZE, scr1;				\
	add	scr1, ECACHE_SIZE, scr1;				\
	set	cpunodes, scr2;						\
	ld	[scr1 + scr2], scr1;					\
	srlx	scr1, 1, scr1

#else /* CHEETAH_PLUS */
/*
 * Macro version of get_ecache_dtag.  We use this macro in the
 * CPU logout code.
 *   afar:	input AFAR, not modified.
 *   datap:	Ptr to ch_ec_data_t, at end pts just past ch_ec_data_t.
 *   scr1:      Scratch.
 *   scr2:	Scratch.
 *   scr3:	Scratch.
 */
#define	GET_ECACHE_DTAG(afar, datap, scr1, scr2, scr3)			\
	mov	afar, scr3;						\
	andn	scr3, (CH_ECACHE_SUBBLK_SIZE - 1), scr3; /* VA<5:0>=0 */\
	set	(CH_ECACHE_8M_SIZE - 1), scr2;				\
	and	scr3, scr2, scr3;		/* VA<63:23>=0 */	\
	b	1f;							\
	   stxa	scr3, [datap + CH_EC_IDX]%asi;	/* store E$ index */	\
	.align	64;							\
1:									\
	ldxa    [scr3]ASI_EC_DIAG, scr1;	/* get E$ tag */	\
	stxa	scr1, [datap + CH_EC_TAG]%asi;				\
	add	datap, CH_EC_DATA, datap;				\
2:									\
	ldxa	[scr3]ASI_EC_R, %g0;		/* ld E$ stging regs */	\
	clr	scr1;							\
3:						/* loop thru 5 regs */	\
	ldxa	[scr1]ASI_EC_DATA, scr2;				\
	stxa	scr2, [datap]%asi;					\
	add	datap, 8, datap;					\
	cmp	scr1, CH_ECACHE_STGREG_TOTALSIZE - 8;			\
	bne	3b;							\
	   add	scr1, 8, scr1;						\
	btst	CH_ECACHE_STGREG_SIZE, scr3;	/* done? */		\
	beq	2b;							\
	   add	scr3, CH_ECACHE_STGREG_SIZE, scr3

/*
 * Cheetah does not have cores so these macros are null.
 */
#define	PARK_SIBLING_CORE(dcucr_reg, scr1, scr2)
#define	UNPARK_SIBLING_CORE(dcucr_reg, scr1, scr2)

/*
 * Cheetah gets primary AFSR and AFAR and clears the AFSR, except for the
 * fatal error bits.
 *	datap:		pointer to cpu logout structure.
 *	afar:		returned primary AFAR value.
 *	scr1:		scratch
 *	scr2:		scratch
 */
#define	GET_AFSR_AFAR(datap, afar, scr1, scr2)	\
	ldxa	[%g0]ASI_AFAR, afar;					\
	stxa	afar, [datap + (CH_CLO_DATA + CH_CHD_AFAR)]%asi;	\
	ldxa	[%g0]ASI_AFSR, scr2;					\
	stxa	scr2, [datap + (CH_CLO_DATA + CH_CHD_AFSR)]%asi;	\
	sethi	%hh(C_AFSR_FATAL_ERRS), scr1;				\
	sllx	scr1, 32, scr1;						\
	bclr	scr1, scr2;	/* Clear fatal error bits here, so */	\
	stxa	scr2, [%g0]ASI_AFSR; /* they're left as is in AFSR */	\
	membar	#Sync

/*
 * Cheetah E$ is direct-mapped, so we grab line data and skip second line.
 *	afar:	AFAR from access.
 *	datap:	pointer to cpu logout structure.
 *	scr1:	scratch
 *	scr2:	scratch
 *	scr3:	scratch
 */
#define	GET_ECACHE_DTAGS(afar, datap, scr1, scr2, scr3)			\
	GET_ECACHE_DTAG(afar, datap, scr1, scr2, scr3);			\
	add	datap, (CHD_EC_DATA_SETS-1)*CH_EC_DATA_SIZE, datap;	\
	add	datap, CH_EC_DATA_SIZE * PN_L2_NWAYS, datap;

/*
 * Cheetah has no shadow AFAR, null operation.
 */
#define	GET_SHADOW_DATA(afar, datap, scr1, scr2, scr3)

#endif	/* CHEETAH_PLUS */

/*
 * Cheetah/(Cheetah+ Jaguar Panther)/Jalapeno Macro for capturing CPU
 * logout data at TL>0. r_val is a register that returns the "failure count"
 * to the caller, and may be used as a scratch register until the end of
 * the macro.  afar is used to return the primary AFAR value to the caller
 * and it too can be used as a scratch register until the end.  datap
 * is a register that points to the logout data area where the data
 * is deposited.  t_flags is a register that has the
 * trap-type/trap-level/CEEN info. This t_flags register may be used after
 * the GET_AFSR_AFAR macro.
 *
 * The CPU logout operation will fail (r_val > 0) if the logout
 * structure in question is already being used. Otherwise, the CPU
 * logout operation will succeed (r_val = 0). For failures, r_val
 * returns the busy count (# of times we tried using this CPU logout
 * structure when it was busy.)
 *
 *   Register usage:
 *	%asi:   Must be set to either ASI_MEM if the address in datap
 *		is a physical address or to ASI_N if the address in
 *		datap is a virtual address.
 *	r_val:	This register is the return value which tells the
 *		caller whether or not the LOGOUT operation was successful.
 *		For failures, r_val returns the fail count (i.e. number of
 *		times we have tried to use this logout structure when it was
 *		already being used.)
 *	afar:	output: contains AFAR on exit
 *	t_flags: input trap type info, may be used as scratch after stored
 *		to cpu log out structure.
 *	datap:	Points to log out data area.
 *	scr1:	Scratch
 *	scr2:	Scratch (may be r_val)
 *	scr3:   Scratch (may be t_flags)
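 *
 * pseudo-code (a sketch of the control flow, using the CH_CLO_* and
 * CH_CHD_* offsets as field names):
 *	if (datap->clo_data.chd_afar != LOGOUT_INVALID) {
 *		afar = AFAR;
 *		r_val = ++datap->clo_nest_cnt;
 *	} else {
 *		datap->clo_flags = t_flags;
 *		capture AFSR/AFAR, E$, D$, I$ and shadow data;
 *		afar = datap->clo_data.chd_afar;
 *		r_val = 0;
 *	}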
 */
#define	DO_TL1_CPU_LOGOUT(r_val, afar, t_flags, datap, scr1, scr2, scr3) \
	setx	LOGOUT_INVALID, scr2, scr1;				\
	ldxa	[datap + (CH_CLO_DATA + CH_CHD_AFAR)]%asi, scr2;	\
	cmp	scr2, scr1;						\
	bne	8f;							\
	  nop;								\
	stxa	t_flags, [datap + CH_CLO_FLAGS]%asi;			\
	GET_AFSR_AFAR(datap, afar, scr1, scr2);				\
	add	datap, CH_CLO_DATA + CH_CHD_EC_DATA, datap;		\
	GET_ECACHE_DTAGS(afar, datap, scr1, scr2, scr3);		\
	GET_DCACHE_DTAG(afar, datap, scr1, scr2, scr3);			\
	GET_ICACHE_DTAG(afar, datap, scr1, scr2, scr3);			\
	sub	datap, CH_CLO_DATA + CH_DIAG_DATA_SIZE, datap;		\
	GET_SHADOW_DATA(afar, datap, scr1, scr2, scr3);			\
	ldxa	[datap + (CH_CLO_DATA + CH_CHD_AFAR)]%asi, afar;	\
	set	0, r_val;	/* return value for success */		\
	ba	9f;							\
	  nop;								\
8:									\
	ldxa	[%g0]ASI_AFAR, afar;					\
	ldxa	[datap + CH_CLO_NEST_CNT]%asi, r_val;			\
	inc	r_val;		/* return value for failure */		\
	stxa	r_val, [datap + CH_CLO_NEST_CNT]%asi;			\
	membar	#Sync;							\
9:

/*
 * Cheetah/(Cheetah+ Jaguar Panther)/Jalapeno Macro for capturing CPU
 * logout data.  Uses DO_TL1_CPU_LOGOUT macro defined above, and sets
 * up the expected data pointer in the scr1 register and sets the %asi
 * register to ASI_N for kernel virtual addresses instead of ASI_MEM as
 * is used at TL>0.
 *
 * The CPU logout operation will fail (r_val > 0) if the logout
 * structure in question is already being used. Otherwise, the CPU
 * logout operation will succeed (r_val = 0). For failures, r_val
 * returns the busy count (# of times we tried using this CPU logout
 * structure when it was busy.)
 *
 *   Register usage:
 *	r_val:	This register is the return value which tells the
 *		caller whether or not the LOGOUT operation was successful.
 *		For failures, r_val returns the fail count (i.e. number of
 *		times we have tried to use this logout structure when it was
 *		already being used.)
 *	afar:	returns AFAR, used internally as afar value.
 *		output: if the cpu_private struct has not been initialized,
 *		        then we return the t_flags value listed below.
 *	r_or_s:	input offset, either register or constant (symbol).  It's
 *		OK for r_or_s to be a register as long as it's not scr1 or
 *		scr3.
 *	t_flags: input trap type info, may be used as scratch after stored
 *		to cpu log out structure.
 *	scr1:	Scratch, points to log out data area.
 *	scr2:	Scratch (may be r_or_s)
 *	scr3:	Scratch (may be r_val)
 *	scr4:   Scratch (may be t_flags)
 */
#define	DO_CPU_LOGOUT(r_val, afar, r_or_s, t_flags, scr1, scr2, scr3, scr4) \
	GET_CPU_PRIVATE_PTR(r_or_s, scr1, scr3, 7f); /* can't use scr2/4 */ \
	wr	%g0, ASI_N, %asi;					\
	DO_TL1_CPU_LOGOUT(r_val, afar, t_flags, scr1, scr2, scr3, scr4)	\
	ba	6f;							\
	  nop;								\
7:									\
	mov	t_flags, afar;		/* depends on afar = %g2  */	\
	set	0, r_val;		/* success in this case.  */	\
6:

/*
 * The P$ is flushed as a side effect of writing to the Primary
 * or Secondary Context Register. After writing to a context
 * register, every line of the P$ in the Valid state is invalidated,
 * regardless of which context it belongs to.
 * This routine simply touches the Primary context register by
 * reading the current value and writing it back. The Primary
 * context is not changed.
 */
#define	PCACHE_FLUSHALL(tmp1, tmp2, tmp3)				\
	sethi	%hi(FLUSH_ADDR), tmp1					;\
	set	MMU_PCONTEXT, tmp2					;\
	ldxa	[tmp2]ASI_DMMU, tmp3					;\
	stxa	tmp3, [tmp2]ASI_DMMU					;\
	flush	tmp1	/* See Cheetah PRM 8.10.2 */

/*
 * Macro that flushes the entire Dcache.
 *
 * arg1 = dcache size
 * arg2 = dcache linesize
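 *
 * Roughly:
 *	for (addr = size - linesize; ; addr -= linesize) {
 *		write 0 to the D$ tag at addr, then membar #Sync;
 *		if (addr == 0) break;
 *	}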
 */
#define	CH_DCACHE_FLUSHALL(arg1, arg2, tmp1)				\
	sub	arg1, arg2, tmp1;					\
1:									\
	stxa	%g0, [tmp1]ASI_DC_TAG;					\
	membar	#Sync;							\
	cmp	%g0, tmp1;						\
	bne,pt	%icc, 1b;						\
	  sub	tmp1, arg2, tmp1;

/*
 * Macro that flushes the entire Icache.
 *
 * Note that we cannot access ASI 0x67 (ASI_IC_TAG) with the Icache on,
 * because accesses to ASI 0x67 interfere with Icache coherency.  We
 * must make sure the Icache is off, then turn it back on after the entire
 * cache has been invalidated.  If the Icache is originally off, we'll just
 * clear the tags but not turn the Icache on.
 *
 * arg1 = icache size
 * arg2 = icache linesize
 */
#define	CH_ICACHE_FLUSHALL(arg1, arg2, tmp1, tmp2)			\
	ldxa	[%g0]ASI_DCU, tmp2;					\
	andn	tmp2, DCU_IC, tmp1;					\
	stxa	tmp1, [%g0]ASI_DCU;					\
	flush	%g0;	/* flush required after changing the IC bit */	\
	sllx	arg2, 1, arg2;		/* arg2 = linesize * 2 */	\
	sllx	arg1, 1, arg1;		/* arg1 = size * 2 */		\
	sub	arg1, arg2, arg1;					\
	or	arg1, CH_ICTAG_LOWER, arg1;	/* "write" tag */	\
1:									\
	stxa	%g0, [arg1]ASI_IC_TAG;					\
	membar	#Sync;				/* Cheetah PRM 8.9.3 */	\
	cmp	arg1, CH_ICTAG_LOWER;					\
	bne,pt	%icc, 1b;						\
	  sub	arg1, arg2, arg1;					\
	stxa	tmp2, [%g0]ASI_DCU;					\
	flush	%g0;	/* flush required after changing the IC bit */


#if defined(JALAPENO) || defined(SERRANO)

/*
 * ASI access to the L2 tag or L2 flush can hang the cpu when interacting 
 * with combinations of L2 snoops, victims and stores.
 *
 * A possible workaround is to surround each L2 ASI access with membars
 * and make sure that the code is hitting in the Icache.  This requires
 * aligning code sequence at E$ boundary and forcing I$ fetch by
 * jumping to selected offsets so that we don't take any I$ misses
 * during ASI access to the L2 tag or L2 flush.  This also requires
 * making sure that we don't take any interrupts or traps (such as
 * fast ECC trap, I$/D$ tag parity error) which can result in eviction
 * of this code sequence from I$, thus causing a miss.
 *
 * Because of the complexity/risk, we have decided to do a partial fix
 * of adding membar around each ASI access to the L2 tag or L2 flush.
 */

#define	JP_EC_DIAG_ACCESS_MEMBAR	\
	membar	#Sync

/*
 * Jalapeno version of macro that flushes the entire Ecache.
 *
 * Uses Jalapeno displacement flush feature of ASI_EC_DIAG.
 *
 * arg1 = ecache size
 * arg2 = ecache linesize - not modified; can be an immediate constant.
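 *
 * Roughly (a sketch): for each of the JP_ECACHE_NWAY ways,
 *	for (off = set_size - linesize; off >= 0; off -= linesize)
 *		displacement-read [portid | way | JP_ECACHE_IDX_DISP_FLUSH
 *		    + off], with a membar before and after each access;
 * where set_size = ecache_size >> JP_EC_TO_SET_SIZE_SHIFT.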
 */
#define	ECACHE_FLUSHALL(arg1, arg2, tmp1, tmp2)	\
	CPU_INDEX(tmp1, tmp2);						\
	set	JP_ECACHE_IDX_DISP_FLUSH, tmp2;				\
	sllx	tmp1, JP_ECFLUSH_PORTID_SHIFT, tmp1;			\
	or	tmp1, tmp2, tmp1;					\
	srlx	arg1, JP_EC_TO_SET_SIZE_SHIFT, tmp2;			\
1:									\
	subcc	tmp2, arg2, tmp2;					\
	JP_EC_DIAG_ACCESS_MEMBAR;					\
	ldxa	[tmp1 + tmp2]ASI_EC_DIAG, %g0;				\
	JP_EC_DIAG_ACCESS_MEMBAR;					\
	bg,pt	%xcc, 1b;						\
	  nop;								\
	mov	1, tmp2;						\
	sllx	tmp2, JP_ECFLUSH_EC_WAY_SHIFT, tmp2;			\
	add	tmp1, tmp2, tmp1;					\
	mov	(JP_ECACHE_NWAY-1), tmp2;				\
	sllx	tmp2, JP_ECFLUSH_EC_WAY_SHIFT, tmp2;			\
	andcc	tmp1, tmp2, tmp2;					\
	bnz,pt	%xcc, 1b;						\
	  srlx	arg1, JP_EC_TO_SET_SIZE_SHIFT, tmp2

#else	/* JALAPENO || SERRANO */

/*
 * Cheetah version of macro that flushes the entire Ecache.
 *
 *  Need to displacement flush 2x ecache size from Ecache flush area.
 *
 * arg1 = ecache size
 * arg2 = ecache linesize
 * arg3 = ecache flush address - for cheetah only
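 *
 * Roughly: for (off = 2 * size - linesize; ; off -= linesize) {
 *	load from flush_addr + off;  if (off == 0) break; }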
 */
#define	CH_ECACHE_FLUSHALL(arg1, arg2, arg3)				\
	sllx	arg1, 1, arg1;						\
1:									\
	subcc	arg1, arg2, arg1;					\
	bg,pt	%xcc, 1b;						\
	  ldxa	[arg1 + arg3]ASI_MEM, %g0;

/*
 * Cheetah+ version of macro that flushes the entire Ecache.
 *
 * Uses the displacement flush feature.
 *
 * arg1 = ecache size
 * arg2 = ecache linesize
 * impl = CPU implementation as returned from GET_CPU_IMPL()
 *        The value in this register is destroyed during execution
 *        of the macro.
 */
#if defined(CHEETAH_PLUS)
#define	CHP_ECACHE_FLUSHALL(arg1, arg2, impl)				\
	cmp	impl, PANTHER_IMPL;					\
	bne	%xcc, 1f;						\
	  nop;								\
	set	PN_L3_IDX_DISP_FLUSH, impl;				\
	b	2f;							\
	  nop;								\
1:									\
	set	CHP_ECACHE_IDX_DISP_FLUSH, impl;			\
2:									\
	subcc	arg1, arg2, arg1;					\
	bg,pt	%xcc, 2b;						\
	  ldxa	[arg1 + impl]ASI_EC_DIAG, %g0;
#else	/* CHEETAH_PLUS */
#define	CHP_ECACHE_FLUSHALL(arg1, arg2, impl)
#endif	/* CHEETAH_PLUS */

/*
 * Macro that flushes the entire Ecache.
 *
 * arg1 = ecache size
 * arg2 = ecache linesize
 * arg3 = ecache flush address - for cheetah only
 */
#define	ECACHE_FLUSHALL(arg1, arg2, arg3, tmp1)				\
	GET_CPU_IMPL(tmp1);						\
	cmp	tmp1, CHEETAH_IMPL;					\
	bne	%xcc, 2f;						\
	  nop;								\
	CH_ECACHE_FLUSHALL(arg1, arg2, arg3);				\
	ba	3f;							\
	  nop;								\
2:									\
	CHP_ECACHE_FLUSHALL(arg1, arg2, tmp1);				\
3:

#endif	/* JALAPENO || SERRANO */

/*
 * Macro that flushes the Panther L2 cache.
 */
#if defined(CHEETAH_PLUS)
#define	PN_L2_FLUSHALL(scr1, scr2, scr3)				\
	GET_CPU_IMPL(scr3);						\
	cmp	scr3, PANTHER_IMPL;					\
	bne	%xcc, 2f;						\
	  nop;								\
	set	PN_L2_SIZE, scr1;					\
	set	PN_L2_LINESIZE, scr2;					\
	set	PN_L2_IDX_DISP_FLUSH, scr3;				\
1:									\
	subcc	scr1, scr2, scr1;					\
	bg,pt	%xcc, 1b;						\
	  ldxa	[scr1 + scr3]ASI_L2_TAG, %g0;				\
2:
#else	/* CHEETAH_PLUS */
#define	PN_L2_FLUSHALL(scr1, scr2, scr3)
#endif	/* CHEETAH_PLUS */

/*
 * Given a VA and page size (page size as encoded in ASI_MMU_TAG_ACCESS_EXT),
 * this macro returns the TLB index for that mapping based on a 512 entry
 * (2-way set associative) TLB. Aside from the 16 entry fully associative
 * TLBs, all TLBs in Panther are 512 entry, 2-way set associative.
 *
 * To find the index, we shift the VA right by 13 + (3 * pg_sz) and then
 * mask out all but the lower 8 bits because:
 *
 *    ASI_[D|I]MMU_TAG_ACCESS_EXT.PgSz = 0 for   8K
 *    ASI_[D|I]MMU_TAG_ACCESS_EXT.PgSz = 1 for  64K
 *    ASI_[D|I]MMU_TAG_ACCESS_EXT.PgSz = 2 for 512K
 *    ASI_[D|I]MMU_TAG_ACCESS_EXT.PgSz = 3 for   4M
 *    ASI_[D|I]MMU_TAG_ACCESS_EXT.PgSz = 4 for  32M
 *    ASI_[D|I]MMU_TAG_ACCESS_EXT.PgSz = 5 for 256M
 *
 * and
 *
 *    array index for   8K pages = VA[20:13]
 *    array index for  64K pages = VA[23:16]
 *    array index for 512K pages = VA[26:19]
 *    array index for   4M pages = VA[29:22]
 *    array index for  32M pages = VA[32:25]
 *    array index for 256M pages = VA[35:28]
 *
 * Inputs:
 *
 *    va	- Register.
 *		  Input: Virtual address in which we are interested.
 *		  Output: TLB index value.
 *    pg_sz	- Register. Page Size of the TLB in question as encoded
 *		  in the ASI_[D|I]MMU_TAG_ACCESS_EXT register.
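 *
 * For example, with pg_sz = 2 (512K pages) the macro computes
 * (va >> (13 + 3 * 2)) & 0xff, i.e. VA[26:19], matching the table above.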
 */
#if defined(CHEETAH_PLUS)
#define	PN_GET_TLB_INDEX(va, pg_sz)					\
	srlx	va, 13, va;	/* first shift out the low 13 bits, */	\
	srlx	va, pg_sz, va;	/* then shift by pg_sz three times. */	\
	srlx	va, pg_sz, va;						\
	srlx	va, pg_sz, va;						\
	and	va, 0xff, va;	/* mask out all but the lower 8 bits */
#endif	/* CHEETAH_PLUS */

/*
 * The following macros are for error traps at TL>0.
 * The issue with error traps at TL>0 is that there are no safely
 * available global registers.  So we use the trick of generating a
 * software trap, then using the %tpc, %tnpc and %tstate registers to
 * temporarily save the values of %g1 and %g2.
 */

/*
 * Macro to generate 8-instruction trap table entry for TL>0 trap handlers.
 * Does the following steps:
 *	1. membar #Sync - required for USIII family errors.
 *	2. Specified software trap.
 * NB: Must be 8 instructions or less to fit in trap table and code must
 *     be relocatable.
 */
#define	CH_ERR_TL1_TRAPENTRY(trapno)		\
	membar	#Sync;				\
	ta	trapno;				\
	nop; nop; nop; nop; nop; nop

/*
 * Macro to generate 8-instruction trap table entry for TL>0 software trap.
 * We save the values of %g1 and %g2 in %tpc, %tnpc and %tstate (since
 * the low-order two bits of %tpc/%tnpc are reserved and read as zero,
 * we need to put the low-order two bits of %g1 and %g2 in %tstate).
 * Note that %tstate has a reserved hole from bits 3-7, so we put the
 * low-order two bits of %g1 in bits 0-1 and the low-order two bits of
 * %g2 in bits 10-11 (ensuring bits 8-9 are zero for use by the D$/I$
 * state bits).  Note that we must do a jmp instruction, since this
 * is moved into the trap table entry.
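 * The resulting %tstate layout is, schematically:
 *	bits <1:0> = %g1<1:0>, bits <9:8> = D$/I$ state (filled in later),
 *	bits <11:10> = %g2<1:0>.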
 * NB: Must be 8 instructions or less to fit in trap table and code must
 *     be relocatable.
 */
#define	CH_ERR_TL1_SWTRAPENTRY(label)		\
	wrpr	%g1, %tpc;			\
	and	%g1, 3, %g1;			\
	wrpr	%g2, %tnpc;			\
	sllx	%g2, CH_ERR_G2_TO_TSTATE_SHFT, %g2; \
	or	%g1, %g2, %g2;			\
	sethi	%hi(label), %g1;		\
	jmp	%g1+%lo(label);			\
	  wrpr	%g2, %tstate

/*
 * Macro to get ptr to ch_err_tl1_data.
 * reg1 will either point to a physaddr with ASI_MEM in %asi OR it
 * will point to a kernel nucleus virtual address with ASI_N in %asi.
 * This allows us to:
 *   1. Avoid getting MMU misses.  We may have gotten the original
 *	Fast ECC error in an MMU handler and if we get an MMU trap
 *	in the TL>0 handlers, we'll scribble on the MMU regs.
 *   2. Allows us to use the same code in the TL>0 handlers whether
 *	we're accessing kernel nucleus virtual addresses or physical
 *	addresses.
 * pseudo-code:
 *	reg1 <- ch_err_tl1_paddrs[CPUID];
 *	if (reg1 == NULL) {
 *		reg1 <- &ch_err_tl1_data
 *		%asi <- ASI_N
 *	} else {
 *		reg1 <- reg1 + offset +
 *		    sizeof (ch_err_tl1_data) * (%tl - 3)
 *		%asi <- ASI_MEM
 *	}
 */
#define	GET_CH_ERR_TL1_PTR(reg1, reg2, offset)	\
	CPU_INDEX(reg1, reg2);			\
	sllx	reg1, 3, reg1;			\
	set	ch_err_tl1_paddrs, reg2;	\
	ldx	[reg1+reg2], reg1;		\
	brnz	reg1, 1f;			\
	add	reg1, offset, reg1;		\
	set	ch_err_tl1_data, reg1;		\
	ba	2f;				\
	wr	%g0, ASI_N, %asi;		\
1:	rdpr	%tl, reg2;			\
	sub	reg2, 3, reg2;			\
	mulx	reg2, CH_ERR_TL1_DATA_SIZE, reg2;	\
	add	reg1, reg2, reg1;		\
	wr	%g0, ASI_MEM, %asi;		\
2:

/*
 * Macro to generate entry code for TL>0 error handlers.
 * At the end of this macro, %g1 will point to the ch_err_tl1_data
 * structure and %g2 will have the original flags in the ch_err_tl1_data
 * structure and %g5 will have the value of %tstate where the Fast ECC
 * routines will save the state of the D$ in the CH_ERR_TSTATE_DC_ON bit.
 * All %g registers except for %g1, %g2 and %g5 will be available after
 * this macro.
 * Does the following steps:
 *   1. Compute physical address of per-cpu/per-tl save area using
 *	only %g1+%g2 (which we've saved in %tpc, %tnpc, %tstate)
 *	leaving address in %g1 and updating the %asi register.
 *	If there is no data area available, we branch to label.
 *   2. Save %g3-%g7 in save area.
 *   3. Save %tpc->%g3, %tnpc->%g4, %tstate->%g5, which contain
 *	original %g1+%g2 values (because we're going to change %tl).
 *   4. set %tl <- %tl - 1.  We do this ASAP to make window of
 *	running at %tl+1 as small as possible.
 *   5. Reconstitute %g1+%g2 from %tpc (%g3), %tnpc (%g4),
 *	%tstate (%g5) and save in save area, carefully preserving %g5
 *	because it has the CH_ERR_TSTATE_DC_ON value.
 *   6. Load existing ch_err_tl1_data flags in %g2
 *   7. Compute the new flags
 *   8. If %g2 is non-zero (the structure was busy), shift the new
 *	flags by CH_ERR_ME_SHIFT and or them with the old flags.
 *   9. Store the updated flags into ch_err_tl1_data flags.
 *   10. If %g2 is non-zero, read the %tpc and store it in
 *	ch_err_tl1_data.
 */
#define	CH_ERR_TL1_ENTER(flags)			\
	GET_CH_ERR_TL1_PTR(%g1, %g2, CHPR_TL1_ERR_DATA);	\
	stxa	%g3, [%g1 + CH_ERR_TL1_G3]%asi;	\
	stxa	%g4, [%g1 + CH_ERR_TL1_G4]%asi;	\
	stxa	%g5, [%g1 + CH_ERR_TL1_G5]%asi;	\
	stxa	%g6, [%g1 + CH_ERR_TL1_G6]%asi;	\
	stxa	%g7, [%g1 + CH_ERR_TL1_G7]%asi;	\
	rdpr	%tpc, %g3;			\
	rdpr	%tnpc, %g4;			\
	rdpr	%tstate, %g5;			\
	rdpr	%tl, %g6;			\
	sub	%g6, 1, %g6;			\
	wrpr	%g6, %tl;			\
	and	%g5, 3, %g6;			\
	andn	%g3, 3, %g3;			\
	or	%g3, %g6, %g3;			\
	stxa	%g3, [%g1 + CH_ERR_TL1_G1]%asi;	\
	srlx	%g5, CH_ERR_G2_TO_TSTATE_SHFT, %g6;	\
	and	%g6, 3, %g6;			\
	andn	%g4, 3, %g4;			\
	or	%g6, %g4, %g4;			\
	stxa	%g4, [%g1 + CH_ERR_TL1_G2]%asi;	\
	ldxa	[%g1 + CH_ERR_TL1_FLAGS]%asi, %g2;	\
	set	flags | CH_ERR_TL, %g3;		\
	brz	%g2, 9f;			\
	sllx	%g3, CH_ERR_ME_SHIFT, %g4;	\
	or	%g2, %g4, %g3;			\
9:	stxa	%g3, [%g1 + CH_ERR_TL1_FLAGS]%asi;	\
	brnz	%g2, 8f;			\
	rdpr	%tpc, %g4;			\
	stxa	%g4, [%g1 + CH_ERR_TL1_TPC]%asi;	\
8:

/*
 * Turns off D$/I$ and saves the state of DCU_DC+DCU_IC in %tstate Bits 8+9
 * (CH_ERR_TSTATE_DC_ON/CH_ERR_TSTATE_IC_ON).  This is invoked on Fast ECC
 * at TL>0 handlers because the D$ may have corrupted data and we need to
 * turn off the I$ to allow for diagnostic accesses.  We then invoke
 * the normal entry macro and after it is done we save the values of
 * the original D$/I$ state, which is in %g5 bits CH_ERR_TSTATE_DC_ON/
 * CH_ERR_TSTATE_IC_ON in ch_err_tl1_tmp.
 */
#define	CH_ERR_TL1_FECC_ENTER			\
	ldxa	[%g0]ASI_DCU, %g1;		\
	andn	%g1, DCU_DC + DCU_IC, %g2;	\
	stxa	%g2, [%g0]ASI_DCU;		\
	flush	%g0;	/* DCU_IC need flush */	\
	rdpr	%tstate, %g2;			\
	and	%g1, DCU_DC + DCU_IC, %g1;	\
	sllx	%g1, CH_ERR_DCU_TO_TSTATE_SHFT, %g1;	\
	or	%g1, %g2, %g2;			\
	wrpr	%g2, %tstate;			\
	CH_ERR_TL1_ENTER(CH_ERR_FECC);		\
	and	%g5, CH_ERR_TSTATE_DC_ON + CH_ERR_TSTATE_IC_ON, %g5;	\
	stxa	%g5, [%g1 + CH_ERR_TL1_TMP]%asi

/*
 * Macro to generate exit code for TL>0 error handlers.
 * We fall into this macro if we've successfully logged the error in
 * the ch_err_tl1_data structure and want the PIL15 softint to pick
 * it up and log it.
 * Does the following steps:
 *   1.	Set pending flag for this cpu in ch_err_tl1_pending.
 *   2.	Write %set_softint with (1<<pil) to cause a pil level trap
 *   3.	Restore registers from ch_err_tl1_data, which is pointed to
 *	by %g1, last register to restore is %g1 since it's pointing
 *	to the save area.
 *   4. Execute retry
 */
#define	CH_ERR_TL1_EXIT				\
	CPU_INDEX(%g2, %g3);			\
	set	ch_err_tl1_pending, %g3;	\
	set	-1, %g4;			\
	stb	%g4, [%g2 + %g3];		\
	mov	1, %g2;				\
	sll	%g2, PIL_15, %g2;		\
	wr	%g2, SET_SOFTINT;		\
	ldxa	[%g1 + CH_ERR_TL1_G7]%asi, %g7;	\
	ldxa	[%g1 + CH_ERR_TL1_G6]%asi, %g6;	\
	ldxa	[%g1 + CH_ERR_TL1_G5]%asi, %g5;	\
	ldxa	[%g1 + CH_ERR_TL1_G4]%asi, %g4;	\
	ldxa	[%g1 + CH_ERR_TL1_G3]%asi, %g3;	\
	ldxa	[%g1 + CH_ERR_TL1_G2]%asi, %g2;	\
	ldxa	[%g1 + CH_ERR_TL1_G1]%asi, %g1;	\
	retry

/*
 * Generates unrecoverable error label for TL>0 handlers.
 * At label (Unrecoverable error routine)
 *   1. Sets flags in ch_err_tl1_data and leaves in %g2 (first
 *	argument to cpu_tl1_err_panic).
 *   2.	Call cpu_tl1_err_panic via systrap at PIL 15
 */
#define	CH_ERR_TL1_PANIC_EXIT(label)		\
label:	ldxa	[%g1 + CH_ERR_TL1_FLAGS]%asi, %g2;	\
	or	%g2, CH_ERR_TL | CH_ERR_PANIC, %g2;	\
	stxa	%g2, [%g1 + CH_ERR_TL1_FLAGS]%asi;	\
	set	cpu_tl1_err_panic, %g1;		\
	ba	sys_trap;			\
	  mov	PIL_15, %g4



/* END CSTYLED */
#endif	/* _ASM */

#ifdef	__cplusplus
}
#endif

#endif /* _CHEETAHASM_H */