xref: /titanic_41/usr/src/uts/sun4u/sys/cheetahasm.h (revision 0b6016e6ff70af39f99c9cc28e0c2207c8f5413c)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #ifndef	_CHEETAHASM_H
28 #define	_CHEETAHASM_H
29 
30 #pragma ident	"%Z%%M%	%I%	%E% SMI"
31 
32 #ifdef	__cplusplus
33 extern "C" {
34 #endif
35 
36 #ifdef _ASM
37 /* BEGIN CSTYLED */
38 
/*
 * ASM_LD: load the 32-bit word stored at 'symbol' into 'reg'.
 *   reg:    output; also used to build the address, so no other
 *           register is modified.
 *   symbol: address to load from (split with %hi/%lo).
 * The macro body ends on its last instruction; it deliberately has no
 * trailing line continuation so that it cannot absorb the following line.
 */
39 #define	ASM_LD(reg, symbol)						\
40 	sethi	%hi(symbol), reg;					\
41 	ld	[reg + %lo(symbol)], reg;
42 
/*
 * ASM_LDX: load the 64-bit extended word stored at 'symbol' into 'reg'.
 *   reg:    output; also used to build the address, so no other
 *           register is modified.
 *   symbol: address to load from (split with %hi/%lo).
 * The macro body ends on its last instruction; it deliberately has no
 * trailing line continuation so that it cannot absorb the following line.
 */
43 #define	ASM_LDX(reg, symbol)						\
44 	sethi	%hi(symbol), reg;					\
45 	ldx	[reg + %lo(symbol)], reg;
46 
/*
 * ASM_JMP: jump to 'symbol'.
 *   reg:    scratch; holds %hi(symbol) when the jump is taken.
 *   symbol: jump target (split with %hi/%lo).
 * The trailing nop fills the delay slot of the jmp.
 */
47 #define	ASM_JMP(reg, symbol)						\
48 	sethi	%hi(symbol), reg;					\
49 	jmp	reg + %lo(symbol);					\
50 	nop
51 
52 /*
53  * Macro for getting to offset from 'cpu_private' ptr.  The 'cpu_private'
54  * ptr is in the machcpu structure.
55  *  off_reg:  Register offset from 'cpu_private' ptr.
56  *  scr1:    Scratch, ptr is returned in this register.
57  *  scr2:    Scratch
58  *  label:   Label to branch to if cpu_private ptr is null/zero.
 *
 * The pointer is loaded from offset CPU_PRIVATE of the address that
 * CPU_ADDR(scr1, scr2) leaves in scr1 (CPU_ADDR is defined elsewhere;
 * presumably it yields the current cpu structure -- confirm there).
 * When the loaded pointer is zero the macro branches to 'label' with
 * scr1 == 0 and off_reg is NOT added.  Otherwise scr1 = ptr + off_reg.
 * The cmp modifies the integer condition codes.
59  */
60 #define	GET_CPU_PRIVATE_PTR(off_reg, scr1, scr2, label)			\
61 	CPU_ADDR(scr1, scr2);						\
62 	ldn	[scr1 + CPU_PRIVATE], scr1;				\
63 	cmp	scr1, 0;						\
64 	be	label;							\
65 	  nop;								\
66 	add	scr1, off_reg, scr1
67 
68 /*
69  * Macro version of get_dcache_dtag.  We use this macro in the
70  * CPU logout code. Since the Dcache is virtually indexed, only
71  * bits [12:5] of the AFAR can be used so we need to search through
72  * 8 indexes (4 ways + bit 13) in order to find the tag we want.
73  *   afar:  input AFAR, not modified.
74  *   datap: input ptr to ch_dc_data_t, at end pts to end of ch_dc_data_t.
75  *   scr1:  scratch.
76  *   scr2:  scratch, will hold tag to look for.
77  *   scr3:  used for Dcache index, loops through 4 ways.
 *
 * The search starts at index (afar & CH_DCACHE_IDX_MASK) and steps by
 * CH_DCACHE_IDX_INCR until the index reaches CH_DCACHE_IDX_LIMIT.  The
 * tag looked for is (afar >> CH_DCTAG_PA_SHIFT) | CH_DCTAG_VALID_BIT.
 * On the first matching index the index, tag, utag, snoop tag and line
 * data are stored through %asi at datap; on Panther the data parity
 * bits are stored as well, one byte per data word.  If no index matches,
 * nothing is stored and datap is just advanced by CH_DC_DATA_SIZE.
 *
 * All stores go through %asi, which the caller must have set up.
 * Numeric local labels 1-5 are used and the integer condition codes
 * are modified.
78  */
79 #define	GET_DCACHE_DTAG(afar, datap, scr1, scr2, scr3)			\
80 	set	CH_DCACHE_IDX_MASK, scr3;				\
81 	and	afar, scr3, scr3;					\
82 	srlx	afar, CH_DCTAG_PA_SHIFT, scr2;				\
83 	b	1f;							\
84 	  or	scr2, CH_DCTAG_VALID_BIT, scr2; /* tag we want */	\
85 	.align	128;							\
86 1:									\
87 	ldxa	[scr3]ASI_DC_TAG, scr1;		/* read tag */		\
88 	cmp	scr1, scr2;						\
89 	bne	4f;				/* not found? */	\
90 	  nop;								\
91 	stxa	scr3, [datap + CH_DC_IDX]%asi;	/* store index */	\
92 	stxa	scr1, [datap + CH_DC_TAG]%asi;	/* store tag */		\
93 	membar	#Sync;			/* Cheetah PRM 10.6.3 */	\
94 	ldxa	[scr3]ASI_DC_UTAG, scr1;	/* read utag */		\
95 	membar	#Sync;			/* Cheetah PRM 10.6.3 */	\
96 	stxa	scr1, [datap + CH_DC_UTAG]%asi;				\
97 	ldxa	[scr3]ASI_DC_SNP_TAG, scr1;	/* read snoop tag */	\
98 	stxa	scr1, [datap + CH_DC_SNTAG]%asi;			\
99 	add	datap, CH_DC_DATA, datap;				\
100 	clr	scr2;			/* scr2 = byte offset in line */ \
101 2:									\
102 	membar	#Sync;			/* Cheetah PRM 10.6.1 */	\
103 	ldxa	[scr3 + scr2]ASI_DC_DATA, scr1;	/* read data */		\
104 	membar	#Sync;			/* Cheetah PRM 10.6.1 */	\
105 	stxa	scr1, [datap]%asi;					\
106 	add	datap, 8, datap;					\
107 	cmp	scr2, CH_DC_DATA_REG_SIZE - 8;				\
108 	blt	2b;							\
109 	  add	scr2, 8, scr2;						\
110 									\
111 	GET_CPU_IMPL(scr2);	/* Parity bits are elsewhere for */	\
112 	cmp	scr2, PANTHER_IMPL;	/* panther processors. */	\
113 	bne,a	5f;			/* Done if not panther. */	\
114 	  add	datap, 8, datap; /* Skip to the end of the struct. */	\
115 	clr	scr2;							\
116 	add	datap, 7, datap; /* offset of the last parity byte */	\
117 	mov	1, scr1;						\
118 	sll	scr1, PN_DC_DATA_PARITY_BIT_SHIFT, scr1;		\
119 	or	scr3, scr1, scr3; /* add DC_data_parity bit to index */	\
120 3:				/* parity bytes are stored last-first */ \
121 	membar	#Sync;			/* Cheetah PRM 10.6.1 */	\
122 	ldxa	[scr3 + scr2]ASI_DC_DATA, scr1;	/* read parity bits */	\
123 	membar	#Sync;			/* Cheetah PRM 10.6.1 */	\
124 	stba	scr1, [datap]%asi;					\
125 	dec	datap;							\
126 	cmp	scr2, CH_DC_DATA_REG_SIZE - 8;				\
127 	blt	3b;							\
128 	  add	scr2, 8, scr2;						\
129 	b	5f;							\
130 	  add	datap, 5, datap; /* set pointer to end of our struct */	\
131 4:				/* tag mismatch: try the next index */	\
132 	set	CH_DCACHE_IDX_INCR, scr1;	/* incr. idx (scr3) */	\
133 	add	scr3, scr1, scr3;					\
134 	set	CH_DCACHE_IDX_LIMIT, scr1;	/* done? */		\
135 	cmp	scr3, scr1;						\
136 	blt	1b;							\
137 	  nop;								\
138 	add	datap, CH_DC_DATA_SIZE, datap;	/* no match: skip struct */ \
139 5:
140 
141 /*
142  * Macro version of get_icache_dtag.  We use this macro in the CPU
143  * logout code. If the Icache is on, we don't want to capture the data.
144  *   afar:  input AFAR, not modified.
145  *   datap: input ptr to ch_ic_data_t, at end pts to end of ch_ic_data_t.
146  *   scr1:  scratch.
147  *   scr2:  scratch, will hold tag to look for.
148  *   scr3:  used for Icache index, loops through 4 ways.
149  * Note: For Panther, the Icache is virtually indexed and increases in
150  * size to 64KB (instead of 32KB) with a line size of 64 bytes (instead
151  * of 32). This means the IC_addr index bits[14:7] for Panther now
152  * correspond to VA bits[13:6]. But since it is virtually indexed, we
153  * still mask out only bits[12:5] from the AFAR (we have to manually
154  * check bit 13). In order to make this code work for all processors,
155  * we end up checking twice as many indexes (8 instead of 4) as required
156  * for non-Panther CPUs and saving off twice as much data (16 instructions
157  * instead of just 8).
 *
 * "Icache is on" means DCU_IC is set in the register read from ASI_DCU;
 * in that case nothing is captured and datap is advanced by
 * CH_IC_DATA_SIZE.  The same advance happens when no index holds a
 * matching physical tag.  On a match the index, pa tag, utag, upper tag,
 * lower tag, snoop tag and PN_IC_DATA_REG_SIZE bytes of instruction data
 * are stored through %asi at datap.
 *
 * Numeric local labels 1-6 are used and the integer condition codes
 * are modified.
158  */
159 #define	GET_ICACHE_DTAG(afar, datap, scr1, scr2, scr3)			\
160 	ldxa	[%g0]ASI_DCU, scr1;					\
161 	btst	DCU_IC, scr1;		/* is Icache enabled? */	\
162 	bne,a	6f;			/* yes, don't capture */	\
163 	  add	datap, CH_IC_DATA_SIZE, datap;	/* annul if no branch */ \
164 	GET_CPU_IMPL(scr2);	/* Panther only uses VA[13:6] */	\
165 	cmp	scr2, PANTHER_IMPL;	/* and we also want to mask */	\
166 	be	1f;			/* out bit 13 since the */	\
167 	  nop;				/* Panther I$ is VIPT. */	\
168 	set	CH_ICACHE_IDX_MASK, scr3;				\
169 	b	2f;							\
170 	  nop;								\
171 1:									\
172 	set	PN_ICACHE_VA_IDX_MASK, scr3;				\
173 2:									\
174 	and	afar, scr3, scr3;					\
175 	sllx	scr3, CH_ICACHE_IDX_SHIFT, scr3;			\
176 	srlx	afar, CH_ICPATAG_SHIFT, scr2;	/* pa tag we want */	\
177 	andn	scr2, CH_ICPATAG_LBITS, scr2;	/* mask off lower */	\
178 	b	3f;							\
179 	  nop;								\
180 	.align	128;							\
181 3:									\
182 	ldxa	[scr3]ASI_IC_TAG, scr1;		/* read pa tag */	\
183 	andn	scr1, CH_ICPATAG_LBITS, scr1;	/* mask off lower */	\
184 	cmp	scr1, scr2;						\
185 	bne	5f;				/* not found? */	\
186 	  nop;								\
187 	stxa	scr3, [datap + CH_IC_IDX]%asi;	/* store index */	\
188 	stxa	scr1, [datap + CH_IC_PATAG]%asi; /* store pa tag */	\
189 	add	scr3, CH_ICTAG_UTAG, scr3;	/* read utag */		\
190 	ldxa	[scr3]ASI_IC_TAG, scr1;					\
191 	add	scr3, (CH_ICTAG_UPPER - CH_ICTAG_UTAG), scr3;		\
192 	stxa	scr1, [datap + CH_IC_UTAG]%asi;				\
193 	ldxa	[scr3]ASI_IC_TAG, scr1;		/* read upper tag */	\
194 	add	scr3, (CH_ICTAG_LOWER - CH_ICTAG_UPPER), scr3;		\
195 	stxa	scr1, [datap + CH_IC_UPPER]%asi;			\
196 	ldxa	[scr3]ASI_IC_TAG, scr1;		/* read lower tag */	\
197 	andn	scr3, CH_ICTAG_TMASK, scr3;	/* drop tag-select bits */ \
198 	stxa	scr1, [datap + CH_IC_LOWER]%asi;			\
199 	ldxa	[scr3]ASI_IC_SNP_TAG, scr1;	/* read snoop tag */	\
200 	stxa	scr1, [datap + CH_IC_SNTAG]%asi;			\
201 	add	datap, CH_IC_DATA, datap;				\
202 	clr	scr2;							\
203 4:									\
204 	ldxa	[scr3 + scr2]ASI_IC_DATA, scr1;	/* read ins. data */	\
205 	stxa	scr1, [datap]%asi;					\
206 	add	datap, 8, datap;					\
207 	cmp	scr2, PN_IC_DATA_REG_SIZE - 8;				\
208 	blt	4b;							\
209 	  add	scr2, 8, scr2;						\
210 	b	6f;							\
211 	  nop;								\
212 5:				/* tag mismatch: try the next index */	\
213 	set	CH_ICACHE_IDX_INCR, scr1;	/* incr. idx (scr3) */	\
214 	add	scr3, scr1, scr3;					\
215 	set	PN_ICACHE_IDX_LIMIT, scr1;	/* done? */		\
216 	cmp	scr3, scr1;						\
217 	blt	3b;							\
218 	  nop;								\
219 	add	datap, CH_IC_DATA_SIZE, datap;	/* no match: skip struct */ \
220 6:
221 
222 #if defined(JALAPENO) || defined(SERRANO)
223 /*
224  * Macro version of get_ecache_dtag.  We use this macro in the
225  * CPU logout code.
226  *   afar:	input AFAR, not modified
227  *   datap:	Ptr to ch_ec_data_t, at end pts just past ch_ec_data_t.
228  *   ec_way:	Constant value (way number)
229  *   scr1:      Scratch
230  *   scr2:	Scratch.
231  *   scr3:	Scratch.
 *
 * The E$ index is built from the set offset of afar (line-aligned) or'ed
 * with ec_way shifted by JP_EC_TAG_DATA_WAY_SHIFT.  The index and the tag
 * read at that index are stored through %asi, followed by the staging
 * register contents.  The outer loop (label 2) repeats until the
 * CH_ECACHE_STGREG_SIZE bit of the index is set; since the index starts
 * line-aligned this is presumably two passes -- confirm against the
 * constant definitions.
 *
 * Numeric local labels 1-3 are used and the integer condition codes
 * are modified.
232  */
233 #define	GET_ECACHE_DTAG(afar, datap, ec_way, scr1, scr2, scr3)		\
234 	mov	ec_way, scr1;						\
235 	and	scr1, JP_ECACHE_NWAY - 1, scr1;	/* mask E$ way bits */	\
236 	sllx	scr1, JP_EC_TAG_DATA_WAY_SHIFT, scr1;			\
237 	set	((JP_ECACHE_MAX_SIZE / JP_ECACHE_NWAY) - 1), scr2;	\
238 	and	afar, scr2, scr3;		/* get set offset */	\
239 	andn	scr3, (JP_ECACHE_MAX_LSIZE - 1), scr3; /* VA<5:0>=0 */	\
240 	or	scr3, scr1, scr3;		/* or WAY bits */	\
241 	b	1f;							\
242 	  stxa	scr3, [datap + CH_EC_IDX]%asi;	/* store E$ index */	\
243 	.align	64;							\
244 1:									\
245 	JP_EC_DIAG_ACCESS_MEMBAR;					\
246 	ldxa    [scr3]ASI_EC_DIAG, scr1;	/* get E$ tag */	\
247 	JP_EC_DIAG_ACCESS_MEMBAR;					\
248 	stxa	scr1, [datap + CH_EC_TAG]%asi;				\
249 	add	datap, CH_EC_DATA, datap;				\
250 2:									\
251 	ldxa	[scr3]ASI_EC_R, %g0;		/* ld E$ stging regs */	\
252 	clr	scr1;							\
253 3:						/* loop thru 5 regs */	\
254 	ldxa	[scr1]ASI_EC_DATA, scr2;				\
255 	stxa	scr2, [datap]%asi;					\
256 	add	datap, 8, datap;					\
257 	cmp	scr1, CH_ECACHE_STGREG_TOTALSIZE - 8;			\
258 	bne	3b;							\
259 	   add	scr1, 8, scr1;						\
260 	btst	CH_ECACHE_STGREG_SIZE, scr3;	/* done? */		\
261 	beq	2b;							\
262 	   add	scr3, CH_ECACHE_STGREG_SIZE, scr3
263 
/*
 * Jalapeno/Serrano: capture the E$ line associated with afar from each
 * of ways 0-3, then advance datap past the remaining
 * (CHD_EC_DATA_SETS - 4) E$ entries and the PN_L2_NWAYS L2 entries of
 * the logout structure, which are left unwritten here.
 *	afar:	AFAR from access.
 *	datap:	pointer into cpu logout structure, advanced as described.
 *	scr1:	scratch
 *	scr2:	scratch
 *	scr3:	scratch
 * The macro body ends on its last instruction; it deliberately has no
 * trailing line continuation so that it cannot absorb the following line.
 */
264 #define	GET_ECACHE_DTAGS(afar, datap, scr1, scr2, scr3)			\
265 	GET_ECACHE_DTAG(afar, datap, 0, scr1, scr2, scr3);		\
266 	GET_ECACHE_DTAG(afar, datap, 1, scr1, scr2, scr3);		\
267 	GET_ECACHE_DTAG(afar, datap, 2, scr1, scr2, scr3);		\
268 	GET_ECACHE_DTAG(afar, datap, 3, scr1, scr2, scr3);		\
269 	add	datap, (CHD_EC_DATA_SETS-4)*CH_EC_DATA_SIZE, datap;	\
270 	add	datap, CH_EC_DATA_SIZE * PN_L2_NWAYS, datap;
271 
272 /*
273  * Jalapeno does not have cores so these macros are null.
 * They expand to nothing; the arguments are accepted only so that
 * callers can invoke them with the same argument list on every CPU type.
274  */
275 #define	PARK_SIBLING_CORE(dcucr_reg, scr1, scr2)
276 #define	UNPARK_SIBLING_CORE(dcucr_reg, scr1, scr2)
277 
278 #if defined(JALAPENO)
279 /*
280  * Jalapeno gets primary AFSR and AFAR.  All bits in the AFSR except
281  * the fatal error bits are cleared.
282  *	datap:		pointer to cpu logout structure.
283  *	afar:		returned primary AFAR value.
284  *	scr1:		scratch
285  *	scr2:		scratch
 *
 * The AFAR and the unmodified AFSR are saved through %asi at
 * CH_CLO_DATA + CH_CHD_AFAR / CH_CHD_AFSR.  The fatal-error mask is
 * built with %hh() followed by a 32-bit left shift, i.e. only the
 * bits <63:42> of C_AFSR_FATAL_ERRS take part in the mask.
286  */
287 #define	GET_AFSR_AFAR(datap, afar, scr1, scr2)				\
288 	ldxa	[%g0]ASI_AFAR, afar;					\
289 	stxa	afar, [datap + (CH_CLO_DATA + CH_CHD_AFAR)]%asi;	\
290 	ldxa	[%g0]ASI_AFSR, scr2;					\
291 	stxa	scr2, [datap + (CH_CLO_DATA + CH_CHD_AFSR)]%asi;	\
292 	sethi	%hh(C_AFSR_FATAL_ERRS), scr1;				\
293 	sllx	scr1, 32, scr1;						\
294 	bclr	scr1, scr2;	/* Clear fatal error bits here, so */	\
295 	stxa	scr2, [%g0]ASI_AFSR; /* they're left as is in AFSR */	\
296 	membar	#Sync
297 
298 /*
299  * Jalapeno has no shadow AFAR, null operation.
 * The macro expands to nothing, so datap and the scratch registers
 * are left untouched.
300  */
301 #define	GET_SHADOW_DATA(afar, datap, scr1, scr2, scr3)
302 
303 #elif defined(SERRANO)
304 /*
305  * Serrano gets primary AFSR and AFAR.  All bits in the AFSR except
306  * the fatal error bits are cleared.  For Serrano, we also save the
307  * AFAR2 register.
308  *	datap:	pointer to cpu logout structure.
309  *	afar:	returned primary AFAR value.
310  *	scr1:	scratch
311  *	scr2:	scratch
 *
 * AFAR2 is read first (afar is used as the temporary for it) and saved
 * at CH_CLO_DATA + CH_CHD_AFAR2; afar is then overwritten with the
 * primary AFAR.  The fatal-error mask is built with %hh() followed by a
 * 32-bit left shift, i.e. only bits <63:42> of C_AFSR_FATAL_ERRS take
 * part in the mask.
312  */
313 #define GET_AFSR_AFAR(datap, afar, scr1, scr2)				\
314 	set	ASI_MCU_AFAR2_VA, scr1;					\
315 	ldxa	[scr1]ASI_MCU_CTRL, afar;				\
316 	stxa	afar, [datap + (CH_CLO_DATA + CH_CHD_AFAR2)]%asi;	\
317 	ldxa	[%g0]ASI_AFAR, afar;					\
318 	stxa	afar, [datap + (CH_CLO_DATA + CH_CHD_AFAR)]%asi;	\
319 	ldxa	[%g0]ASI_AFSR, scr2;					\
320 	stxa	scr2, [datap + (CH_CLO_DATA + CH_CHD_AFSR)]%asi;	\
321 	sethi	%hh(C_AFSR_FATAL_ERRS), scr1;				\
322 	sllx	scr1, 32, scr1;						\
323 	bclr	scr1, scr2;	/* Clear fatal error bits here, so */	\
324 	stxa	scr2, [%g0]ASI_AFSR; /* they're left as is in AFSR */ 	\
325 	membar	#Sync
326 
327 /*
328  * Serrano needs to capture E$, D$ and I$ lines associated with afar2.
329  *      afar:   scratch, holds afar2.
330  *      datap:  pointer to cpu logout structure
331  *      scr1:   scratch
332  *      scr2:   scratch
333  *      scr3:   scratch
 *
 * afar2 is reloaded from the logout structure (CH_CLO_DATA +
 * CH_CHD_AFAR2), so GET_AFSR_AFAR must already have stored it there.
 * datap is moved to the shadow E$ area, advanced by the three capture
 * macros, and finally decremented by CH_CPU_LOGOUT_SIZE; this restores
 * the original datap provided the shadow diagnostic data is the last
 * member of the logout structure -- confirm against the structure
 * offsets.
334  */
335 #define	GET_SHADOW_DATA(afar, datap, scr1, scr2, scr3)		\
336 	ldxa	[datap + (CH_CLO_DATA + CH_CHD_AFAR2)]%asi, afar;	\
337 	add	datap, CH_CLO_SDW_DATA + CH_CHD_EC_DATA, datap;		\
338 	GET_ECACHE_DTAGS(afar, datap, scr1, scr2, scr3);		\
339 	GET_DCACHE_DTAG(afar, datap, scr1, scr2, scr3);			\
340 	GET_ICACHE_DTAG(afar, datap, scr1, scr2, scr3);			\
341 	sub	datap, CH_CPU_LOGOUT_SIZE, datap
342 #endif /* SERRANO */
343 
344 #elif defined(CHEETAH_PLUS)
345 /*
346  * Macro version of get_ecache_dtag.  We use this macro in the
347  * CPU logout code.
348  *   afar:	input AFAR, not modified.
349  *   datap:	Ptr to ch_ec_data_t, at end pts just past ch_ec_data_t.
350  *   pn_way:	ecache way for panther (value = 0-3). For non-panther
351  *		cpus, this macro will be called with pn_way = 0.
352  *   scr1:	Scratch.
353  *   scr2:	Scratch.
354  *   scr3:	Scratch.
 *
 * The E$ index is afar with the sub-block offset bits and everything
 * above the 8MB range cleared, or'ed with pn_way shifted by
 * PN_L3_WAY_SHIFT.  The index, the tag, the tag ECC (read at
 * index | CHP_ECACHE_IDX_TAG_ECC) and the staging register contents are
 * stored through %asi.  The outer loop (label 2) repeats until the
 * CH_ECACHE_STGREG_SIZE bit of the index is set.
 *
 * Numeric local labels 1-3 are used and the integer condition codes
 * are modified.
355  */
356 #define	GET_ECACHE_DTAG(afar, datap, pn_way, scr1, scr2, scr3)		\
357 	mov	afar, scr3;						\
358 	andn	scr3, (CH_ECACHE_SUBBLK_SIZE - 1), scr3; /* VA<5:0>=0 */\
359 	set	(CH_ECACHE_8M_SIZE - 1), scr2;				\
360 	and	scr3, scr2, scr3;		/* VA<63:23>=0 */	\
361 	mov	pn_way, scr1;	/* panther L3$ is 4-way so we ...    */	\
362 	sllx	scr1, PN_L3_WAY_SHIFT, scr1;	/* need to mask...   */	\
363 	or	scr3, scr1, scr3;	/* in the way bits <24:23>.  */	\
364 	b	1f;							\
365 	   stxa	scr3, [datap + CH_EC_IDX]%asi;	/* store E$ index */	\
366 	.align	64;							\
367 1:									\
368 	ldxa    [scr3]ASI_EC_DIAG, scr1;	/* get E$ tag */	\
369 	stxa     scr1, [datap + CH_EC_TAG]%asi;				\
370 	set	CHP_ECACHE_IDX_TAG_ECC, scr1;				\
371 	or	scr3, scr1, scr1;					\
372 	ldxa    [scr1]ASI_EC_DIAG, scr1;	/* get E$ tag ECC */	\
373 	stxa	scr1, [datap + CH_EC_TAG_ECC]%asi;			\
374 	add	datap, CH_EC_DATA, datap;				\
375 2:									\
376 	ldxa	[scr3]ASI_EC_R, %g0;		/* ld E$ stging regs */	\
377 	clr	scr1;							\
378 3:						/* loop thru 5 regs */	\
379 	ldxa	[scr1]ASI_EC_DATA, scr2;				\
380 	stxa	scr2, [datap]%asi;					\
381 	add	datap, 8, datap;					\
382 	cmp	scr1, CH_ECACHE_STGREG_TOTALSIZE - 8;			\
383 	bne	3b;							\
384 	   add	scr1, 8, scr1;						\
385 	btst	CH_ECACHE_STGREG_SIZE, scr3;	/* done? */		\
386 	beq	2b;							\
387 	   add	scr3, CH_ECACHE_STGREG_SIZE, scr3
388 
389 /*
390  * If this is a panther, we need to make sure the sibling core is
391  * parked so that we avoid any race conditions during diagnostic
392  * accesses to the shared L2 and L3 caches.
393  * dcucr_reg:	This register will be used to keep track of whether
394  *		or not we need to unpark the core later.
395  *		It just so happens that we also use this same register
396  *		to keep track of our saved DCUCR value so we only touch
397  *		bit 4 of the register (which is a "reserved" bit in the
398  *		DCUCR) for keeping track of core parking.
399  * scr1:	Scratch register.
400  * scr2:	Scratch register.
 *
 * On exit PN_PARKED_OTHER_CORE is set in dcucr_reg only if this macro
 * actually parked the sibling; it is cleared when the CPU is not a
 * Panther or when the running status is not PN_BOTH_CORES_RUNNING.
 * The value written to the core-running register is (1 << own core id),
 * i.e. only the current core is left running, and the macro spins until
 * the running status register reads back that same value.
 *
 * Numeric local labels 1-2 are used and the integer condition codes
 * are modified.
401  */
402 #define	PARK_SIBLING_CORE(dcucr_reg, scr1, scr2)			\
403 	GET_CPU_IMPL(scr1);						\
404 	cmp	scr1, PANTHER_IMPL;	/* only park for panthers */	\
405 	bne,a	%xcc, 2f;						\
406 	  andn	dcucr_reg, PN_PARKED_OTHER_CORE, dcucr_reg;		\
407 	set	ASI_CORE_RUNNING_STATUS, scr1;	/* check other core */	\
408 	ldxa	[scr1]ASI_CMP_SHARED, scr2;	/* is it running?   */	\
409 	cmp	scr2, PN_BOTH_CORES_RUNNING;				\
410 	bne,a	%xcc, 2f;	/* if not running, we are done */	\
411 	  andn	dcucr_reg, PN_PARKED_OTHER_CORE, dcucr_reg;		\
412 	or	dcucr_reg, PN_PARKED_OTHER_CORE, dcucr_reg;		\
413 	set	ASI_CORE_ID, scr1;					\
414 	ldxa	[scr1]ASI_CMP_PER_CORE, scr2;				\
415 	and	scr2, COREID_MASK, scr2;				\
416 	or	%g0, 1, scr1;		/* find out which core... */	\
417 	sll	scr1, scr2, scr2;	/* ... we need to park... */	\
418 1:									\
419 	set	ASI_CORE_RUNNING_RW, scr1;				\
420 	stxa	scr2, [scr1]ASI_CMP_SHARED;	/* ... and park it. */	\
421 	membar	#Sync;							\
422 	set	ASI_CORE_RUNNING_STATUS, scr1;	/* spin until... */	\
423 	ldxa	[scr1]ASI_CMP_SHARED, scr1;	/* ... the other...  */	\
424 	cmp	scr1, scr2;	/* ...core is parked according to... */	\
425 	bne,a	%xcc, 1b;	/* ...the core running status reg.  */	\
426 	  nop;								\
427 2:
428 
429 /*
430  * The core running this code will unpark its sibling core if the
431  * sibling core had been parked by the current core earlier in this
432  * trap handler.
433  * dcucr_reg:	This register is used to keep track of whether or not
434  *		we need to unpark our sibling core.
435  *		It just so happens that we also use this same register
436  *		to keep track of our saved DCUCR value so we only touch
437  *		bit 4 of the register (which is a "reserved" bit in the
438  *		DCUCR) for keeping track of core parking.
439  * scr1:	Scratch register.
440  * scr2:	Scratch register.
 *
 * PN_PARKED_OTHER_CORE is always cleared in dcucr_reg on exit: the andn
 * sits in the delay slot of a non-annulled branch and therefore runs
 * whether or not the branch is taken.  scr1 and scr2 are only written
 * when the sibling is actually unparked.  Numeric local label 1 is used
 * and the integer condition codes are modified.
441  */
442 #define	UNPARK_SIBLING_CORE(dcucr_reg, scr1, scr2)			\
443 	btst	PN_PARKED_OTHER_CORE, dcucr_reg;			\
444 	bz,pt	%xcc, 1f;	/* if nothing to unpark, we are done */	\
445 	  andn	dcucr_reg, PN_PARKED_OTHER_CORE, dcucr_reg;		\
446 	set	ASI_CORE_RUNNING_RW, scr1;				\
447 	set	PN_BOTH_CORES_RUNNING, scr2;	/* we want both...   */	\
448 	stxa	scr2, [scr1]ASI_CMP_SHARED;	/* ...cores running. */	\
449 	membar	#Sync;							\
450 1:
451 
452 /*
453  * Cheetah+ and Jaguar get both primary and secondary AFSR/AFAR.  All bits
454  * in the primary AFSR are cleared except the fatal error bits.  For Panther,
455  * we also have to read and clear the AFSR_EXT, again leaving the fatal
456  * error bits alone.
457  *	datap:		pointer to cpu logout structure.
458  *	afar:		returned primary AFAR value.
459  *	scr1:		scratch
460  *	scr2:		scratch
 *
 * The shadow (secondary) AFAR/AFSR are saved under CH_CLO_SDW_DATA and
 * the primary ones under CH_CLO_DATA, all through %asi.  The shadow
 * registers are only read, never written back.  The primary fatal-error
 * mask is built with %hh() followed by a 32-bit left shift, i.e. only
 * bits <63:42> of C_AFSR_FATAL_ERRS take part in the mask; the AFSR_EXT
 * mask uses the full C_AFSR_EXT_FATAL_ERRS value.
 *
 * Numeric local label 1 is used and the integer condition codes are
 * modified.
461  */
462 #define	GET_AFSR_AFAR(datap, afar, scr1, scr2)				\
463 	set	ASI_SHADOW_REG_VA, scr1;				\
464 	ldxa	[scr1]ASI_AFAR, scr2;					\
465 	stxa	scr2, [datap + (CH_CLO_SDW_DATA + CH_CHD_AFAR)]%asi;	\
466 	ldxa	[scr1]ASI_AFSR, scr2;					\
467 	stxa	scr2, [datap + (CH_CLO_SDW_DATA + CH_CHD_AFSR)]%asi;	\
468 	ldxa	[%g0]ASI_AFAR, afar;					\
469 	stxa	afar, [datap + (CH_CLO_DATA + CH_CHD_AFAR)]%asi;	\
470 	ldxa	[%g0]ASI_AFSR, scr2;					\
471 	stxa	scr2, [datap + (CH_CLO_DATA + CH_CHD_AFSR)]%asi;	\
472 	sethi	%hh(C_AFSR_FATAL_ERRS), scr1;				\
473 	sllx	scr1, 32, scr1;						\
474 	bclr	scr1, scr2;	/* Clear fatal error bits here, so */ 	\
475 	stxa	scr2, [%g0]ASI_AFSR; /* they're left as is in AFSR */	\
476 	membar	#Sync;							\
477 	GET_CPU_IMPL(scr1);						\
478 	cmp	scr1, PANTHER_IMPL;	/* AFSR_EXT exists on Panther only */ \
479 	bne	%xcc, 1f;						\
480 	   nop;								\
481 	set	ASI_SHADOW_AFSR_EXT_VA, scr1;	/* shadow AFSR_EXT */	\
482 	ldxa	[scr1]ASI_AFSR, scr2;					\
483 	stxa	scr2, [datap + (CH_CLO_SDW_DATA + CH_CHD_AFSR_EXT)]%asi; \
484 	set	ASI_AFSR_EXT_VA, scr1;		/* primary AFSR_EXT */	\
485 	ldxa	[scr1]ASI_AFSR, scr2;					\
486 	stxa	scr2, [datap + (CH_CLO_DATA + CH_CHD_AFSR_EXT)]%asi;	\
487 	set	C_AFSR_EXT_FATAL_ERRS, scr1;				\
488 	bclr	scr1, scr2;	/* Clear fatal error bits here, */	\
489 	set	ASI_AFSR_EXT_VA, scr1;	/* so they're left */		\
490 	stxa	scr2, [scr1]ASI_AFSR;	/* as is in AFSR_EXT */		\
491 	membar	#Sync;							\
492 1:
493 
494 /*
495  * This macro is used in the CPU logout code to capture diagnostic
496  * information from the L2 cache on panther processors.
497  *   afar:	input AFAR, not modified.
498  *   datap:	Ptr to pn_l2_data_t, at end pts just past pn_l2_data_t.
499  *   scr1:	Scratch.
500  *   scr2:	Scratch.
501  *   scr3:	Scratch.
 *
 * Starting from index (afar & PN_L2_INDEX_MASK), one entry is written
 * per way: index, tag, the first half of the line, its ECC, the second
 * half of the line and its ECC.  The index is stepped by PN_L2_SET_SIZE
 * and the sequence repeated for as long as PN_L2_MAX_SET is greater than
 * the index just processed.  The second data loop (label 3) relies on
 * scr1 still holding PN_L2_LINESIZE / 2 from the first loop's delay slot.
 *
 * Numeric local labels 1-3 are used and the integer condition codes
 * are modified.
502  */
503 #define	GET_PN_L2_CACHE_DTAGS(afar, datap, scr1, scr2, scr3)		\
504 	mov	afar, scr3;						\
505 	set	PN_L2_INDEX_MASK, scr1;					\
506 	and	scr3, scr1, scr3;					\
507 	b	1f;	/* code to read tags and data should be ...  */	\
508 	   nop;		/* ...on the same cache line if possible.    */	\
509 	.align	128;	/* update this line if you add lines below. */	\
510 1:									\
511 	stxa	scr3, [datap + CH_EC_IDX]%asi;	/* store L2$ index  */	\
512 	ldxa	[scr3]ASI_L2_TAG, scr1;		/* read the L2$ tag */	\
513 	stxa	scr1, [datap + CH_EC_TAG]%asi;				\
514 	add	datap, CH_EC_DATA, datap;				\
515 	clr	scr1;							\
516 2:									\
517 	ldxa	[scr3 + scr1]ASI_L2_DATA, scr2;	/* loop through     */	\
518 	stxa	scr2, [datap]%asi;		/* <511:256> of L2  */	\
519 	add	datap, 8, datap;		/* data and record  */	\
520 	cmp	scr1, (PN_L2_LINESIZE / 2) - 8;	/* it in the cpu    */	\
521 	bne	2b;				/* logout struct.   */	\
522 	  add	scr1, 8, scr1;						\
523 	set	PN_L2_DATA_ECC_SEL, scr2;	/* ECC_sel bit.     */	\
524 	ldxa	[scr3 + scr2]ASI_L2_DATA, scr2;	/* Read and record  */	\
525 	stxa	scr2, [datap]%asi;		/* ecc of <511:256> */	\
526 	add	datap, 8, datap;					\
527 3:									\
528 	ldxa	[scr3 + scr1]ASI_L2_DATA, scr2;	/* loop through     */	\
529 	stxa	scr2, [datap]%asi;		/* <255:0> of L2    */	\
530 	add	datap, 8, datap;		/* data and record  */	\
531 	cmp	scr1, PN_L2_LINESIZE - 8;	/* it in the cpu    */	\
532 	bne	3b;				/* logout struct.   */	\
533 	  add	scr1, 8, scr1;						\
534 	set	PN_L2_DATA_ECC_SEL, scr2;	/* ECC_sel bit.     */	\
535 	add	scr2, PN_L2_ECC_LO_REG, scr2;				\
536 	ldxa	[scr3 + scr2]ASI_L2_DATA, scr2;	/* Read and record  */	\
537 	stxa	scr2, [datap]%asi;		/* ecc of <255:0>.  */	\
538 	add	datap, 8, datap;		/* Advance pointer  */	\
539 	set	PN_L2_SET_SIZE, scr2;					\
540 	set	PN_L2_MAX_SET, scr1;					\
541 	cmp	scr1, scr3;	/* more ways to try for this line? */	\
542 	bg,a	%xcc, 1b;	/* if so, start over with next way */	\
543 	  add	scr3, scr2, scr3
544 
545 /*
546  * Cheetah+ assumes E$ is 2-way and grabs both E$ lines associated with afar.
 * On Panther (implementation == PANTHER_IMPL) the four L3 ways 0-3 are
 * captured instead, the remaining (CHD_EC_DATA_SETS - 4) E$ entries are
 * skipped, and the L2 data is captured with GET_PN_L2_CACHE_DTAGS.
 * On non-Panther CPUs the second way is reached by xor'ing afar with the
 * value from GET_ECACHE_WAY_BIT; afar is xor'ed a second time afterwards
 * to restore it.  The remaining (CHD_EC_DATA_SETS - 2) E$ entries and the
 * PN_L2_NWAYS L2 entries are then skipped.
547  *	afar:	AFAR from access.
548  *	datap:	pointer to cpu logout structure.
549  *	scr1:	scratch
550  *	scr2:	scratch
551  *	scr3:	scratch
 *
 * Numeric local labels 4-5 are used here (1-3 by the nested macros) and
 * the integer condition codes are modified.
552  */
553 #define	GET_ECACHE_DTAGS(afar, datap, scr1, scr2, scr3)			\
554 	GET_CPU_IMPL(scr1);						\
555 	cmp	scr1, PANTHER_IMPL;					\
556 	bne	%xcc, 4f;						\
557 	  nop;								\
558 	GET_ECACHE_DTAG(afar, datap, 0, scr1, scr2, scr3);		\
559 	GET_ECACHE_DTAG(afar, datap, 1, scr1, scr2, scr3);		\
560 	GET_ECACHE_DTAG(afar, datap, 2, scr1, scr2, scr3);		\
561 	GET_ECACHE_DTAG(afar, datap, 3, scr1, scr2, scr3);		\
562 	add	datap, (CHD_EC_DATA_SETS-4)*CH_EC_DATA_SIZE, datap;	\
563 	GET_PN_L2_CACHE_DTAGS(afar, datap, scr1, scr2, scr3);		\
564 	b	5f;							\
565 	  nop;								\
566 4:									\
567 	GET_ECACHE_DTAG(afar, datap, 0, scr1, scr2, scr3);		\
568 	GET_ECACHE_WAY_BIT(scr1, scr2);					\
569 	xor	afar, scr1, afar;	/* select the other way */	\
570 	GET_ECACHE_DTAG(afar, datap, 0, scr1, scr2, scr3);		\
571 	GET_ECACHE_WAY_BIT(scr1, scr2);		/* restore AFAR */	\
572 	xor	afar, scr1, afar;					\
573 	add	datap, (CHD_EC_DATA_SETS-2)*CH_EC_DATA_SIZE, datap;	\
574 	add	datap, CH_EC_DATA_SIZE * PN_L2_NWAYS, datap;		\
575 5:
576 
577 /*
578  * Cheetah+ needs to capture E$, D$ and I$ lines associated with
579  * shadow afar.
580  *	afar:	scratch, holds shadow afar.
581  *	datap:	pointer to cpu logout structure
582  *	scr1:	scratch
583  *	scr2:	scratch
584  *	scr3:	scratch
 *
 * The shadow afar is reloaded from the logout structure
 * (CH_CLO_SDW_DATA + CH_CHD_AFAR), so GET_AFSR_AFAR must already have
 * stored it there.  datap is moved to the shadow E$ area, advanced by
 * the three capture macros, and finally decremented by
 * CH_CPU_LOGOUT_SIZE; this restores the original datap provided the
 * shadow diagnostic data is the last member of the logout structure --
 * confirm against the structure offsets.
585  */
586 #define	GET_SHADOW_DATA(afar, datap, scr1, scr2, scr3)		\
587 	ldxa	[datap + (CH_CLO_SDW_DATA + CH_CHD_AFAR)]%asi, afar;	\
588 	add	datap, CH_CLO_SDW_DATA + CH_CHD_EC_DATA, datap;	\
589 	GET_ECACHE_DTAGS(afar, datap, scr1, scr2, scr3);		\
590 	GET_DCACHE_DTAG(afar, datap, scr1, scr2, scr3);			\
591 	GET_ICACHE_DTAG(afar, datap, scr1, scr2, scr3);			\
592 	sub	datap, CH_CPU_LOGOUT_SIZE, datap
593 
594 /*
595  * Compute the "Way" bit for 2-way Ecache for Cheetah+.
 *	scr1:	output; the 32-bit value at offset ECACHE_SIZE of this
 *		cpu's cpunodes[] entry, shifted right by one (i.e. half
 *		of that value).
 *	scr2:	scratch
 * The entry is located as cpunodes + index * CPU_NODE_SIZE, where the
 * index is whatever CPU_INDEX(scr1, scr2) leaves in scr1 (CPU_INDEX is
 * defined elsewhere; presumably the current cpu id -- confirm there).
596  */
597 #define	GET_ECACHE_WAY_BIT(scr1, scr2)					\
598 	CPU_INDEX(scr1, scr2);						\
599 	mulx	scr1, CPU_NODE_SIZE, scr1;				\
600 	add	scr1, ECACHE_SIZE, scr1;				\
601 	set	cpunodes, scr2;						\
602 	ld	[scr1 + scr2], scr1;					\
603 	srlx	scr1, 1, scr1
604 
605 #else /* CHEETAH_PLUS */
606 /*
607  * Macro version of get_ecache_dtag.  We use this macro in the
608  * CPU logout code.
609  *   afar:	input AFAR, not modified.
610  *   datap:	Ptr to ch_ec_data_t, at end pts just past ch_ec_data_t.
611  *   scr1:      Scratch.
612  *   scr2:	Scratch.
613  *   scr3:	Scratch.
 *
 * The E$ index is afar with the sub-block offset bits and everything
 * above the 8MB range cleared.  The index, the tag and the staging
 * register contents are stored through %asi.  The outer loop (label 2)
 * repeats until the CH_ECACHE_STGREG_SIZE bit of the index is set.
 *
 * Numeric local labels 1-3 are used and the integer condition codes
 * are modified.
614  */
615 #define	GET_ECACHE_DTAG(afar, datap, scr1, scr2, scr3)			\
616 	mov	afar, scr3;						\
617 	andn	scr3, (CH_ECACHE_SUBBLK_SIZE - 1), scr3; /* VA<5:0>=0 */\
618 	set	(CH_ECACHE_8M_SIZE - 1), scr2;				\
619 	and	scr3, scr2, scr3;		/* VA<63:23>=0 */	\
620 	b	1f;							\
621 	   stxa	scr3, [datap + CH_EC_IDX]%asi;	/* store E$ index */	\
622 	.align	64;							\
623 1:									\
624 	ldxa    [scr3]ASI_EC_DIAG, scr1;	/* get E$ tag */	\
625 	stxa	scr1, [datap + CH_EC_TAG]%asi;				\
626 	add	datap, CH_EC_DATA, datap;				\
627 2:									\
628 	ldxa	[scr3]ASI_EC_R, %g0;		/* ld E$ stging regs */	\
629 	clr	scr1;							\
630 3:						/* loop thru 5 regs */	\
631 	ldxa	[scr1]ASI_EC_DATA, scr2;				\
632 	stxa	scr2, [datap]%asi;					\
633 	add	datap, 8, datap;					\
634 	cmp	scr1, CH_ECACHE_STGREG_TOTALSIZE - 8;			\
635 	bne	3b;							\
636 	   add	scr1, 8, scr1;						\
637 	btst	CH_ECACHE_STGREG_SIZE, scr3;	/* done? */		\
638 	beq	2b;							\
639 	   add	scr3, CH_ECACHE_STGREG_SIZE, scr3
640 
641 /*
642  * Cheetah does not have cores so these macros are null.
 * They expand to nothing; the arguments are accepted only so that
 * callers can invoke them with the same argument list on every CPU type.
643  */
644 #define	PARK_SIBLING_CORE(dcucr_reg, scr1, scr2)
645 #define	UNPARK_SIBLING_CORE(dcucr_reg, scr1, scr2)
646 
647 /*
648  * Cheetah gets primary AFSR and AFAR and clears the AFSR, except for the
649  * fatal error bits.
650  *	datap:		pointer to cpu logout structure.
651  *	afar:		returned primary AFAR value.
652  *	scr1:		scratch
653  *	scr2:		scratch
 *
 * The AFAR and the unmodified AFSR are saved through %asi at
 * CH_CLO_DATA + CH_CHD_AFAR / CH_CHD_AFSR.  The fatal-error mask is
 * built with %hh() followed by a 32-bit left shift, i.e. only the
 * bits <63:42> of C_AFSR_FATAL_ERRS take part in the mask.
654  */
655 #define	GET_AFSR_AFAR(datap, afar, scr1, scr2)	\
656 	ldxa	[%g0]ASI_AFAR, afar;					\
657 	stxa	afar, [datap + (CH_CLO_DATA + CH_CHD_AFAR)]%asi;	\
658 	ldxa	[%g0]ASI_AFSR, scr2;					\
659 	stxa	scr2, [datap + (CH_CLO_DATA + CH_CHD_AFSR)]%asi;	\
660 	sethi	%hh(C_AFSR_FATAL_ERRS), scr1;				\
661 	sllx	scr1, 32, scr1;						\
662 	bclr	scr1, scr2;	/* Clear fatal error bits here, so */	\
663 	stxa	scr2, [%g0]ASI_AFSR; /* they're left as is in AFSR */	\
664 	membar	#Sync
665 
666 /*
667  * Cheetah E$ is direct-mapped, so we grab line data and skip second line.
 * After the single capture, datap is advanced past the remaining
 * (CHD_EC_DATA_SETS - 1) E$ entries and the PN_L2_NWAYS L2 entries of
 * the logout structure, which are left unwritten here.
668  *	afar:	AFAR from access.
669  *	datap:	pointer to cpu logout structure.
670  *	scr1:	scratch
671  *	scr2:	scratch
672  *	scr3:	scratch
 *
 * The macro body ends on its last instruction; it deliberately has no
 * trailing line continuation so that it cannot absorb the following line.
673  */
674 #define	GET_ECACHE_DTAGS(afar, datap, scr1, scr2, scr3)			\
675 	GET_ECACHE_DTAG(afar, datap, scr1, scr2, scr3);			\
676 	add	datap, (CHD_EC_DATA_SETS-1)*CH_EC_DATA_SIZE, datap;	\
677 	add	datap, CH_EC_DATA_SIZE * PN_L2_NWAYS, datap;
678 
679 /*
680  * Cheetah has no shadow AFAR, null operation.
 * The macro expands to nothing, so datap and the scratch registers
 * are left untouched.
681  */
682 #define	GET_SHADOW_DATA(afar, datap, scr1, scr2, scr3)
683 
684 #endif	/* CHEETAH_PLUS */
685 
686 /*
687  * Cheetah/(Cheetah+ Jaguar Panther)/Jalapeno Macro for capturing CPU
688  * logout data at TL>0. r_val is a register that returns the "failure count"
689  * to the caller, and may be used as a scratch register until the end of
690  * the macro.  afar is used to return the primary AFAR value to the caller
691  * and it too can be used as a scratch register until the end. datap is
692  * a register that points at the logout data area in which to deposit
693  * the logout data.  t_flags is a register that has the
694  * trap-type/trap-level/CEEN info. This t_flags register may be used after
695  * the GET_AFSR_AFAR macro.
696  *
697  * The CPU logout operation will fail (r_val > 0) if the logout
698  * structure in question is already being used. Otherwise, the CPU
699  * logout operation will succeed (r_val = 0). For failures, r_val
700  * returns the busy count (# of times we tried using this CPU logout
701  * structure when it was busy.)
 *
 * The structure counts as free when its saved AFAR field
 * (CH_CLO_DATA + CH_CHD_AFAR) equals LOGOUT_INVALID.  When it is busy,
 * nothing is captured: afar is read directly from ASI_AFAR and the
 * counter at CH_CLO_NEST_CNT is incremented and returned in r_val.
 * On both paths datap is left pointing at the start of the structure.
702  *
703  *   Register usage:
704  *	%asi:   Must be set to either ASI_MEM if the address in datap
705  *		is a physical address or to ASI_N if the address in
706  *		datap is a virtual address.
707  *	r_val:	This register is the return value which tells the
708  *		caller whether or not the LOGOUT operation was successful.
709  *		For failures, r_val returns the fail count (i.e. number of
710  *		times we have tried to use this logout structure when it was
711  *		already being used.
712  *	afar:	output: contains AFAR on exit
713  *	t_flags: input trap type info, may be used as scratch after stored
714  *		to cpu log out structure.
715  *	datap:	Points to log out data area.
716  *	scr1:	Scratch
717  *	scr2:	Scratch (may be r_val)
718  *	scr3:   Scratch (may be t_flags)
 *
 * Numeric local labels 8-9 are used here (1-6 by the nested macros) and
 * the integer condition codes are modified.
719  */
720 #define	DO_TL1_CPU_LOGOUT(r_val, afar, t_flags, datap, scr1, scr2, scr3) \
721 	setx	LOGOUT_INVALID, scr2, scr1;				\
722 	ldxa	[datap + (CH_CLO_DATA + CH_CHD_AFAR)]%asi, scr2;	\
723 	cmp	scr2, scr1;		/* logout struct free? */	\
724 	bne	8f;			/* no, count the failure */	\
725 	  nop;								\
726 	stxa	t_flags, [datap + CH_CLO_FLAGS]%asi;			\
727 	GET_AFSR_AFAR(datap, afar, scr1, scr2);				\
728 	add	datap, CH_CLO_DATA + CH_CHD_EC_DATA, datap;		\
729 	GET_ECACHE_DTAGS(afar, datap, scr1, scr2, scr3);		\
730 	GET_DCACHE_DTAG(afar, datap, scr1, scr2, scr3);			\
731 	GET_ICACHE_DTAG(afar, datap, scr1, scr2, scr3);			\
732 	sub	datap, CH_CLO_DATA + CH_DIAG_DATA_SIZE, datap;		\
733 	GET_SHADOW_DATA(afar, datap, scr1, scr2, scr3);			\
734 	ldxa	[datap + (CH_CLO_DATA + CH_CHD_AFAR)]%asi, afar;	\
735 	set	0, r_val;	/* return value for success */		\
736 	ba	9f;							\
737 	  nop;								\
738 8:									\
739 	ldxa	[%g0]ASI_AFAR, afar;					\
740 	ldxa	[datap + CH_CLO_NEST_CNT]%asi, r_val;			\
741 	inc	r_val;		/* return value for failure */		\
742 	stxa	r_val, [datap + CH_CLO_NEST_CNT]%asi;			\
743 	membar	#Sync;							\
744 9:
745 
746 /*
747  * Cheetah/(Cheetah+ Jaguar Panther)/Jalapeno Macro for capturing CPU
748  * logout data.  Uses DO_TL1_CPU_LOGOUT macro defined above, and sets
749  * up the expected data pointer in the scr1 register and sets the %asi
750  * register to ASI_N for kernel virtual addresses instead of ASI_MEM as
751  * is used at TL>0.
752  *
753  * The CPU logout operation will fail (r_val > 0) if the logout
754  * structure in question is already being used. Otherwise, the CPU
755  * logout operation will succeed (r_val = 0). For failures, r_val
756  * returns the busy count (# of times we tried using this CPU logout
757  * structure when it was busy.)
758  *
759  *   Register usage:
760  *	r_val:	This register is the return value which tells the
761  *		caller whether or not the LOGOUT operation was successful.
762  *		For failures, r_val returns the fail count (i.e. number of
763  *		times we have tried to use this logout structure when it was
764  *		already being used.
765  *	afar:	returns AFAR, used internally as afar value.
766  *		output: if the cpu_private struct has not been initialized,
767  *		        then we return the t_flags value listed below.
768  *	r_or_s:	input offset, either register or constant (symbol).  It's
769  *		OK for r_or_s to be a register as long as it's not scr1 or
770  *		scr3.
771  *	t_flags: input trap type info, may be used as scratch after stored
772  *		to cpu log out structure.
773  *	scr1:	Scratch, points to log out data area.
774  *	scr2:	Scratch (may be r_or_s)
775  *	scr3:	Scratch (may be r_val)
776  *	scr4:   Scratch (may be t_flags)
777  */
#define	DO_CPU_LOGOUT(r_val, afar, r_or_s, t_flags, scr1, scr2, scr3, scr4) \
	/* scr1 <- logout area ptr; go to 7 if cpu_private is null. */	\
	/* Only scr1 and scr3 are handed out here, never scr2/scr4. */	\
	GET_CPU_PRIVATE_PTR(r_or_s, scr1, scr3, 7f);			\
	wr	%g0, ASI_N, %asi;	/* kernel virtual accesses */	\
	DO_TL1_CPU_LOGOUT(r_val, afar, t_flags, scr1, scr2, scr3, scr4)	\
	ba	6f;			/* logout attempted; done */	\
	  nop;								\
7:					/* no cpu_private area */	\
	mov	t_flags, afar;		/* relies on afar being %g2 */	\
	clr	r_val;			/* reported as success */	\
6:
788 
789 /*
790  * The P$ is flushed as a side effect of writing to the Primary
791  * or Secondary Context Register. After writing to a context
792  * register, every line of the P$ in the Valid state is invalidated,
793  * regardless of which context it belongs to.
794  * This routine simply touches the Primary context register by
795  * reading the current value and writing it back. The Primary
796  * context is not changed.
797  */
#define	PCACHE_FLUSHALL(tmp1, tmp2, tmp3)				\
	sethi	%hi(FLUSH_ADDR), tmp1;	/* operand for the flush */	\
	set	MMU_PCONTEXT, tmp2;	/* primary context register */	\
	ldxa	[tmp2]ASI_DMMU, tmp3;	/* read the current value */	\
	stxa	tmp3, [tmp2]ASI_DMMU;	/* and write it back as is */	\
	flush	tmp1			/* See Cheetah PRM 8.10.2 */
804 
805 /*
806  * Macro that flushes the entire Dcache.
807  *
808  * arg1 = dcache size
809  * arg2 = dcache linesize
810  */
#define	CH_DCACHE_FLUSHALL(arg1, arg2, tmp1)				\
	sub	arg1, arg2, tmp1;	/* start at the last line */	\
1:									\
	stxa	%g0, [tmp1]ASI_DC_TAG;	/* zero the tag at tmp1 */	\
	membar	#Sync;							\
	tst	tmp1;			/* was that offset 0? */	\
	bne,pt	%icc, 1b;						\
	  sub	tmp1, arg2, tmp1;	/* delay: previous line */
819 
820 /*
821  * Macro that flushes the entire Icache.
822  *
823  * Note that we cannot access ASI 0x67 (ASI_IC_TAG) with the Icache on,
824  * because accesses to ASI 0x67 interfere with Icache coherency.  We
825  * must make sure the Icache is off, then turn it back on after the entire
826  * cache has been invalidated.  If the Icache is originally off, we'll just
827  * clear the tags but not turn the Icache on.
828  *
829  * arg1 = icache size
830  * arg2 = icache linesize
831  */
#define	CH_ICACHE_FLUSHALL(arg1, arg2, tmp1, tmp2)			\
	ldxa	[%g0]ASI_DCU, tmp2;	/* tmp2 = original DCU */	\
	andn	tmp2, DCU_IC, tmp1;	/* same value minus DCU_IC */	\
	stxa	tmp1, [%g0]ASI_DCU;	/* Icache is now off */		\
	flush	%g0;	/* flush required after changing the IC bit */	\
	sllx	arg1, 1, arg1;		/* arg1 = size * 2 */		\
	sllx	arg2, 1, arg2;		/* arg2 = linesize * 2 */	\
	sub	arg1, arg2, arg1;	/* offset of the last entry */	\
	or	arg1, CH_ICTAG_LOWER, arg1;	/* "write" tag */	\
1:									\
	stxa	%g0, [arg1]ASI_IC_TAG;	/* clear this tag */		\
	membar	#Sync;				/* Cheetah PRM 8.9.3 */	\
	cmp	arg1, CH_ICTAG_LOWER;	/* was that offset 0? */	\
	bne,pt	%icc, 1b;						\
	  sub	arg1, arg2, arg1;	/* delay: previous entry */	\
	stxa	tmp2, [%g0]ASI_DCU;	/* put back original DCU */	\
	flush	%g0;	/* flush required after changing the IC bit */
849 
850 
851 #if defined(JALAPENO) || defined(SERRANO)
852 
853 /*
854  * ASI access to the L2 tag or L2 flush can hang the cpu when interacting
855  * with combinations of L2 snoops, victims and stores.
856  *
857  * A possible workaround is to surround each L2 ASI access with membars
858  * and make sure that the code is hitting in the Icache.  This requires
859  * aligning code sequence at E$ boundary and forcing I$ fetch by
860  * jumping to selected offsets so that we don't take any I$ misses
861  * during ASI access to the L2 tag or L2 flush.  This also requires
862  * making sure that we don't take any interrupts or traps (such as
863  * fast ECC trap, I$/D$ tag parity error) which can result in eviction
864  * of this code sequence from I$, thus causing a miss.
865  *
866  * Because of the complexity/risk, we have decided to do a partial fix
867  * of adding membar around each ASI access to the L2 tag or L2 flush.
868  */
869 
/* Barrier placed on each side of an L2 tag / L2 flush ASI access. */
#define	JP_EC_DIAG_ACCESS_MEMBAR	membar	#Sync
872 
873 /*
874  * Jalapeno version of macro that flushes the entire Ecache.
875  *
876  * Uses Jalapeno displacement flush feature of ASI_EC_DIAG.
877  *
878  * arg1 = ecache size
879  * arg2 = ecache linesize - not modified; can be an immediate constant.
880  */
#define	ECACHE_FLUSHALL(arg1, arg2, tmp1, tmp2)				\
	CPU_INDEX(tmp1, tmp2);						\
	set	JP_ECACHE_IDX_DISP_FLUSH, tmp2;				\
	sllx	tmp1, JP_ECFLUSH_PORTID_SHIFT, tmp1;			\
	or	tmp1, tmp2, tmp1;	/* tmp1 = flush base address */	\
	srlx	arg1, JP_EC_TO_SET_SIZE_SHIFT, tmp2; /* initial offset */ \
1:					/* inner loop: walk offsets */	\
	subcc	tmp2, arg2, tmp2;	/* step down one line */	\
	JP_EC_DIAG_ACCESS_MEMBAR;					\
	ldxa	[tmp1 + tmp2]ASI_EC_DIAG, %g0;				\
	JP_EC_DIAG_ACCESS_MEMBAR;					\
	bg,pt	%xcc, 1b;		/* cc still from the subcc */	\
	  nop;								\
	mov	1, tmp2;						\
	sllx	tmp2, JP_ECFLUSH_EC_WAY_SHIFT, tmp2;			\
	add	tmp1, tmp2, tmp1;	/* bump the way field */	\
	mov	(JP_ECACHE_NWAY-1), tmp2;				\
	sllx	tmp2, JP_ECFLUSH_EC_WAY_SHIFT, tmp2;			\
	andcc	tmp1, tmp2, tmp2;	/* masked way bits */		\
	bnz,pt	%xcc, 1b;		/* repeat while nonzero */	\
	  srlx	arg1, JP_EC_TO_SET_SIZE_SHIFT, tmp2 /* delay: reload */
902 
903 #else	/* JALAPENO || SERRANO */
904 
905 /*
906  * Cheetah version of macro that flushes the entire Ecache.
907  *
908  *  Need to displacement flush 2x ecache size from Ecache flush area.
909  *
910  * arg1 = ecache size
911  * arg2 = ecache linesize
912  * arg3 = ecache flush address - for cheetah only
913  */
#define	CH_ECACHE_FLUSHALL(arg1, arg2, arg3)				\
	sllx	arg1, 1, arg1;		/* cover 2x the ecache size */	\
1:									\
	subcc	arg1, arg2, arg1;	/* step down one line */	\
	bg,pt	%xcc, 1b;		/* last pass reads offset 0 */	\
	  ldxa	[arg1 + arg3]ASI_MEM, %g0; /* delay: displacing load */
920 
921 /*
922  * Cheetah+ version of macro that flushes the entire Ecache.
923  *
924  * Uses the displacement flush feature.
925  *
926  * arg1 = ecache size
927  * arg2 = ecache linesize
928  * impl = CPU implementation as returned from GET_CPU_IMPL()
929  *        The value in this register is destroyed during execution
930  *        of the macro.
931  */
#ifdef	CHEETAH_PLUS
#define	CHP_ECACHE_FLUSHALL(arg1, arg2, impl)				\
	cmp	impl, PANTHER_IMPL;					\
	bne	%xcc, 1f;						\
	  nop;								\
	set	PN_L3_IDX_DISP_FLUSH, impl;	/* Panther */		\
	ba	2f;							\
	  nop;								\
1:									\
	set	CHP_ECACHE_IDX_DISP_FLUSH, impl; /* not Panther */	\
2:					/* impl is now the flush base */ \
	subcc	arg1, arg2, arg1;	/* step down one line */	\
	bg,pt	%xcc, 2b;						\
	  ldxa	[arg1 + impl]ASI_EC_DIAG, %g0;
#else	/* !CHEETAH_PLUS */
#define	CHP_ECACHE_FLUSHALL(arg1, arg2, impl)	/* expands to nothing */
#endif	/* CHEETAH_PLUS */
949 
950 /*
951  * Macro that flushes the entire Ecache.
952  *
953  * arg1 = ecache size
954  * arg2 = ecache linesize
955  * arg3 = ecache flush address - for cheetah only
956  */
#define	ECACHE_FLUSHALL(arg1, arg2, arg3, tmp1)				\
	GET_CPU_IMPL(tmp1);						\
	cmp	tmp1, CHEETAH_IMPL;					\
	bne	%xcc, 2f;		/* not CHEETAH_IMPL */		\
	  nop;								\
	CH_ECACHE_FLUSHALL(arg1, arg2, arg3);				\
	ba	3f;							\
	  nop;								\
2:									\
	CHP_ECACHE_FLUSHALL(arg1, arg2, tmp1);	/* tmp1 = impl */	\
3:
968 
969 #endif	/* JALAPENO || SERRANO */
970 
971 /*
972  * Macro that flushes the Panther L2 cache.
973  */
#ifdef	CHEETAH_PLUS
#define	PN_L2_FLUSHALL(scr1, scr2, scr3)				\
	GET_CPU_IMPL(scr3);						\
	cmp	scr3, PANTHER_IMPL;					\
	bne	%xcc, 2f;		/* skip unless Panther */	\
	  nop;								\
	set	PN_L2_SIZE, scr1;					\
	set	PN_L2_LINESIZE, scr2;					\
	set	PN_L2_IDX_DISP_FLUSH, scr3;				\
1:									\
	subcc	scr1, scr2, scr1;	/* step down one line */	\
	bg,pt	%xcc, 1b;						\
	  ldxa	[scr1 + scr3]ASI_L2_TAG, %g0; /* delay: flush access */	\
2:
#else	/* !CHEETAH_PLUS */
#define	PN_L2_FLUSHALL(scr1, scr2, scr3)	/* expands to nothing */
#endif	/* CHEETAH_PLUS */
991 
992 /*
993  * Given a VA and page size (page size as encoded in ASI_MMU_TAG_ACCESS_EXT),
994  * this macro returns the TLB index for that mapping based on a 512 entry
 * (2-way set associative) TLB. Aside from the 16 entry fully associative
996  * TLBs, all TLBs in Panther are 512 entry, 2-way set associative.
997  *
998  * To find the index, we shift the VA right by 13 + (3 * pg_sz) and then
999  * mask out all but the lower 8 bits because:
1000  *
1001  *    ASI_[D|I]MMU_TAG_ACCESS_EXT.PgSz = 0 for   8K
1002  *    ASI_[D|I]MMU_TAG_ACCESS_EXT.PgSz = 1 for  64K
1003  *    ASI_[D|I]MMU_TAG_ACCESS_EXT.PgSz = 2 for 512K
1004  *    ASI_[D|I]MMU_TAG_ACCESS_EXT.PgSz = 3 for   4M
1005  *    ASI_[D|I]MMU_TAG_ACCESS_EXT.PgSz = 4 for  32M
1006  *    ASI_[D|I]MMU_TAG_ACCESS_EXT.PgSz = 5 for 256M
1007  *
1008  * and
1009  *
1010  *    array index for   8K pages = VA[20:13]
1011  *    array index for  64K pages = VA[23:16]
1012  *    array index for 512K pages = VA[26:19]
1013  *    array index for   4M pages = VA[29:22]
1014  *    array index for  32M pages = VA[32:25]
1015  *    array index for 256M pages = VA[35:28]
1016  *
1017  * Inputs:
1018  *
1019  *    va	- Register.
1020  *		  Input: Virtual address in which we are interested.
1021  *		  Output: TLB index value.
1022  *    pg_sz	- Register. Page Size of the TLB in question as encoded
1023  *		  in the ASI_[D|I]MMU_TAG_ACCESS_EXT register.
1024  */
#ifdef	CHEETAH_PLUS
#define	PN_GET_TLB_INDEX(va, pg_sz)					\
	srlx	va, 13, va;	/* the fixed 13-bit part of the shift */ \
	srlx	va, pg_sz, va;	/* plus 3 * pg_sz, applied as three */	\
	srlx	va, pg_sz, va;	/* separate shifts of pg_sz each */	\
	srlx	va, pg_sz, va;						\
	and	va, 0xff, va;	/* keep only the low 8 bits */
#endif	/* CHEETAH_PLUS */
1033 
1034 /*
1035  * The following macros are for error traps at TL>0.
1036  * The issue with error traps at TL>0 is that there are no safely
1037  * available global registers.  So we use the trick of generating a
1038  * software trap, then using the %tpc, %tnpc and %tstate registers to
1039  * temporarily save the values of %g1 and %g2.
1040  */
1041 
1042 /*
1043  * Macro to generate 8-instruction trap table entry for TL>0 trap handlers.
1044  * Does the following steps:
1045  *	1. membar #Sync - required for USIII family errors.
1046  *	2. Specified software trap.
1047  * NB: Must be 8 instructions or less to fit in trap table and code must
1048  *     be relocatable.
1049  */
#define	CH_ERR_TL1_TRAPENTRY(trapno)		\
	membar	#Sync;		/* 1 */		\
	ta	trapno;		/* 2 */		\
	nop;			/* 3 */		\
	nop;			/* 4 */		\
	nop;			/* 5 */		\
	nop;			/* 6 */		\
	nop;			/* 7 */		\
	nop			/* 8 */
1054 
1055 /*
1056  * Macro to generate 8-instruction trap table entry for TL>0 software trap.
1057  * We save the values of %g1 and %g2 in %tpc, %tnpc and %tstate (since
1058  * the low-order two bits of %tpc/%tnpc are reserved and read as zero,
1059  * we need to put the low-order two bits of %g1 and %g2 in %tstate).
1060  * Note that %tstate has a reserved hole from bits 3-7, so we put the
1061  * low-order two bits of %g1 in bits 0-1 and the low-order two bits of
 * %g2 in bits 10-11 (ensuring bits 8-9 are zero for use by the D$/I$
1063  * state bits).  Note that we must do a jmp instruction, since this
1064  * is moved into the trap table entry.
1065  * NB: Must be 8 instructions or less to fit in trap table and code must
1066  *     be relocatable.
1067  */
#define	CH_ERR_TL1_SWTRAPENTRY(label)				\
	wrpr	%g1, %tpc;		/* stash %g1 */		\
	and	%g1, 3, %g1;		/* its low two bits */	\
	wrpr	%g2, %tnpc;		/* stash %g2 */		\
	sllx	%g2, CH_ERR_G2_TO_TSTATE_SHFT, %g2;		\
	or	%g1, %g2, %g2;		/* value for %tstate */	\
	sethi	%hi(label), %g1;				\
	jmp	%g1 + %lo(label);				\
	  wrpr	%g2, %tstate		/* delay slot */
1077 
1078 /*
1079  * Macro to get ptr to ch_err_tl1_data.
1080  * reg1 will either point to a physaddr with ASI_MEM in %asi OR it
1081  * will point to a kernel nucleus virtual address with ASI_N in %asi.
1082  * This allows us to:
1083  *   1. Avoid getting MMU misses.  We may have gotten the original
1084  *	Fast ECC error in an MMU handler and if we get an MMU trap
1085  *	in the TL>0 handlers, we'll scribble on the MMU regs.
1086  *   2. Allows us to use the same code in the TL>0 handlers whether
1087  *	we're accessing kernel nucleus virtual addresses or physical
1088  *	addresses.
1089  * pseudo-code:
1090  *	reg1 <- ch_err_tl1_paddrs[CPUID];
1091  *	if (reg1 == NULL) {
1092  *		reg1 <- &ch_err_tl1_data
1093  *		%asi <- ASI_N
1094  *	} else {
1095  *		reg1 <- reg1 + offset +
1096  *		    sizeof (ch_err_tl1_data) * (%tl - 3)
1097  *		%asi <- ASI_MEM
1098  *	}
1099  */
#define	GET_CH_ERR_TL1_PTR(reg1, reg2, offset)				\
	CPU_INDEX(reg1, reg2);						\
	sllx	reg1, 3, reg1;		/* scale index by 8 bytes */	\
	set	ch_err_tl1_paddrs, reg2;				\
	ldx	[reg1 + reg2], reg1;	/* ch_err_tl1_paddrs[CPUID] */	\
	brnz	reg1, 1f;						\
	  add	reg1, offset, reg1;	/* delay: unused if NULL */	\
	set	ch_err_tl1_data, reg1;	/* NULL: &ch_err_tl1_data */	\
	ba	2f;							\
	  wr	%g0, ASI_N, %asi;	/* delay slot */		\
1:	rdpr	%tl, reg2;						\
	sub	reg2, 3, reg2;		/* %tl - 3 */			\
	mulx	reg2, CH_ERR_TL1_DATA_SIZE, reg2;			\
	add	reg1, reg2, reg1;	/* slot for this trap level */	\
	wr	%g0, ASI_MEM, %asi;					\
2:
1116 
1117 /*
1118  * Macro to generate entry code for TL>0 error handlers.
1119  * At the end of this macro, %g1 will point to the ch_err_tl1_data
1120  * structure and %g2 will have the original flags in the ch_err_tl1_data
1121  * structure and %g5 will have the value of %tstate where the Fast ECC
1122  * routines will save the state of the D$ in Bit2 CH_ERR_TSTATE_DC_ON.
1123  * All %g registers except for %g1, %g2 and %g5 will be available after
1124  * this macro.
1125  * Does the following steps:
1126  *   1. Compute physical address of per-cpu/per-tl save area using
1127  *	only %g1+%g2 (which we've saved in %tpc, %tnpc, %tstate)
1128  *	leaving address in %g1 and updating the %asi register.
1129  *	If there is no data area available, we branch to label.
1130  *   2. Save %g3-%g7 in save area.
1131  *   3. Save %tpc->%g3, %tnpc->%g4, %tstate->%g5, which contain
1132  *	original %g1+%g2 values (because we're going to change %tl).
1133  *   4. set %tl <- %tl - 1.  We do this ASAP to make window of
1134  *	running at %tl+1 as small as possible.
1135  *   5. Reconstitute %g1+%g2 from %tpc (%g3), %tnpc (%g4),
1136  *	%tstate (%g5) and save in save area, carefully preserving %g5
1137  *	because it has the CH_ERR_TSTATE_DC_ON value.
1138  *   6. Load existing ch_err_tl1_data flags in %g2
1139  *   7. Compute the new flags
1140  *   8. If %g2 is non-zero (the structure was busy), shift the new
1141  *	flags by CH_ERR_ME_SHIFT and or them with the old flags.
1142  *   9. Store the updated flags into ch_err_tl1_data flags.
 *   10. If %g2 is zero (the structure was not busy), read the %tpc and
 *	store it in ch_err_tl1_data.
1145  */
#define	CH_ERR_TL1_ENTER(flags)						\
	GET_CH_ERR_TL1_PTR(%g1, %g2, CHPR_TL1_ERR_DATA);		\
	/* Save %g3-%g7 in the save area that %g1 points to. */	\
	stxa	%g3, [%g1 + CH_ERR_TL1_G3]%asi;				\
	stxa	%g4, [%g1 + CH_ERR_TL1_G4]%asi;				\
	stxa	%g5, [%g1 + CH_ERR_TL1_G5]%asi;				\
	stxa	%g6, [%g1 + CH_ERR_TL1_G6]%asi;				\
	stxa	%g7, [%g1 + CH_ERR_TL1_G7]%asi;				\
	/* Fetch the stashed %g1/%g2 pieces before %tl is changed. */	\
	rdpr	%tpc, %g3;						\
	rdpr	%tnpc, %g4;						\
	rdpr	%tstate, %g5;						\
	rdpr	%tl, %g6;						\
	dec	%g6;							\
	wrpr	%g6, %tl;		/* %tl <- %tl - 1 */		\
	/* Original %g1 = %tpc with %tstate bits 0-1 as its low bits. */ \
	and	%g5, 3, %g6;						\
	andn	%g3, 3, %g3;						\
	or	%g3, %g6, %g3;						\
	stxa	%g3, [%g1 + CH_ERR_TL1_G1]%asi;				\
	/* Original %g2 = %tnpc plus the shifted %tstate bits. */	\
	srlx	%g5, CH_ERR_G2_TO_TSTATE_SHFT, %g6;			\
	and	%g6, 3, %g6;						\
	andn	%g4, 3, %g4;						\
	or	%g6, %g4, %g4;						\
	stxa	%g4, [%g1 + CH_ERR_TL1_G2]%asi;				\
	/* %g2 = old flag word; %g3 = new one, merged when busy. */	\
	ldxa	[%g1 + CH_ERR_TL1_FLAGS]%asi, %g2;			\
	set	flags | CH_ERR_TL, %g3;					\
	brz	%g2, 9f;						\
	  sllx	%g3, CH_ERR_ME_SHIFT, %g4;	/* delay slot */	\
	or	%g2, %g4, %g3;						\
9:	stxa	%g3, [%g1 + CH_ERR_TL1_FLAGS]%asi;			\
	brnz	%g2, 8f;		/* busy: skip the %tpc store */	\
	  rdpr	%tpc, %g4;		/* delay slot */		\
	stxa	%g4, [%g1 + CH_ERR_TL1_TPC]%asi;			\
8:
1178 
1179 /*
1180  * Turns off D$/I$ and saves the state of DCU_DC+DCU_IC in %tstate Bits 8+9
1181  * (CH_ERR_TSTATE_DC_ON/CH_ERR_TSTATE_IC_ON).  This is invoked on Fast ECC
1182  * at TL>0 handlers because the D$ may have corrupted data and we need to
1183  * turn off the I$ to allow for diagnostic accesses.  We then invoke
1184  * the normal entry macro and after it is done we save the values of
1185  * the original D$/I$ state, which is in %g5 bits CH_ERR_TSTATE_DC_ON/
1186  * CH_ERR_TSTATE_IC_ON in ch_err_tl1_tmp.
1187  */
#define	CH_ERR_TL1_FECC_ENTER						\
	ldxa	[%g0]ASI_DCU, %g1;	/* %g1 = original DCU */	\
	andn	%g1, DCU_DC + DCU_IC, %g2;				\
	stxa	%g2, [%g0]ASI_DCU;	/* DCU_DC and DCU_IC cleared */	\
	flush	%g0;			/* DCU_IC need flush */		\
	rdpr	%tstate, %g2;						\
	and	%g1, DCU_DC + DCU_IC, %g1;	/* prior on/off bits */	\
	sllx	%g1, CH_ERR_DCU_TO_TSTATE_SHFT, %g1;			\
	or	%g2, %g1, %g2;						\
	wrpr	%g2, %tstate;		/* kept in %tstate for now */	\
	CH_ERR_TL1_ENTER(CH_ERR_FECC);					\
	and	%g5, CH_ERR_TSTATE_DC_ON + CH_ERR_TSTATE_IC_ON, %g5;	\
	stxa	%g5, [%g1 + CH_ERR_TL1_TMP]%asi	/* into ch_err_tl1_tmp */
1201 
1202 /*
1203  * Macro to generate exit code for TL>0 error handlers.
1204  * We fall into this macro if we've successfully logged the error in
1205  * the ch_err_tl1_data structure and want the PIL15 softint to pick
1206  * it up and log it.
1207  * Does the following steps:
1208  *   1.	Set pending flag for this cpu in ch_err_tl1_pending.
1209  *   2.	Write %set_softint with (1<<pil) to cause a pil level trap
1210  *   3.	Restore registers from ch_err_tl1_data, which is pointed to
1211  *	by %g1, last register to restore is %g1 since it's pointing
1212  *	to the save area.
1213  *   4. Execute retry
1214  */
#define	CH_ERR_TL1_EXIT							\
	CPU_INDEX(%g2, %g3);						\
	set	ch_err_tl1_pending, %g3;				\
	set	-1, %g4;						\
	stb	%g4, [%g2 + %g3];	/* pending byte for this cpu */	\
	mov	1, %g2;							\
	sll	%g2, PIL_15, %g2;	/* 1 << PIL_15 */		\
	wr	%g2, SET_SOFTINT;	/* post the softint */		\
	/* Reload %g7 down to %g1; %g1 goes last, it is the pointer. */	\
	ldxa	[%g1 + CH_ERR_TL1_G7]%asi, %g7;				\
	ldxa	[%g1 + CH_ERR_TL1_G6]%asi, %g6;				\
	ldxa	[%g1 + CH_ERR_TL1_G5]%asi, %g5;				\
	ldxa	[%g1 + CH_ERR_TL1_G4]%asi, %g4;				\
	ldxa	[%g1 + CH_ERR_TL1_G3]%asi, %g3;				\
	ldxa	[%g1 + CH_ERR_TL1_G2]%asi, %g2;				\
	ldxa	[%g1 + CH_ERR_TL1_G1]%asi, %g1;				\
	retry
1231 
1232 /*
1233  * Generates unrecoverable error label for TL>0 handlers.
1234  * At label (Unrecoverable error routine)
1235  *   1. Sets flags in ch_err_tl1_data and leaves in %g2 (first
1236  *	argument to cpu_tl1_err_panic).
1237  *   2.	Call cpu_tl1_err_panic via systrap at PIL 15
1238  */
#define	CH_ERR_TL1_PANIC_EXIT(label)					\
label:	ldxa	[%g1 + CH_ERR_TL1_FLAGS]%asi, %g2;			\
	or	%g2, CH_ERR_TL | CH_ERR_PANIC, %g2;			\
	stxa	%g2, [%g1 + CH_ERR_TL1_FLAGS]%asi; /* %g2 stays live */	\
	set	cpu_tl1_err_panic, %g1;	/* routine for sys_trap */	\
	ba	sys_trap;						\
	  mov	PIL_15, %g4		/* delay: PIL 15 in %g4 */
1246 
1247 
1248 
1249 /* END CSTYLED */
1250 #endif	/* _ASM */
1251 
1252 #ifdef	__cplusplus
1253 }
1254 #endif
1255 
1256 #endif /* _CHEETAHASM_H */
1257