xref: /titanic_50/usr/src/uts/sun4u/sys/cheetahasm.h (revision ed05dc578bfae88f2ec13e7cc6bb5d91bc1c3bd1)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #ifndef	_CHEETAHASM_H
27 #define	_CHEETAHASM_H
28 
29 #pragma ident	"%Z%%M%	%I%	%E% SMI"
30 
31 #ifdef	__cplusplus
32 extern "C" {
33 #endif
34 
35 #ifdef _ASM
36 /* BEGIN CSTYLED */
37 
38 #define	ASM_LD(reg, symbol)	/* reg = 32-bit word stored at symbol */ \
39 	sethi	%hi(symbol), reg;	/* reg = %hi part of the address */ \
40 	ld	[reg + %lo(symbol)], reg; /* load the word over the address */
41 
42 #define	ASM_LDX(reg, symbol)	/* reg = 64-bit value stored at symbol */ \
43 	sethi	%hi(symbol), reg;	/* reg = %hi part of the address */ \
44 	ldx	[reg + %lo(symbol)], reg; /* load the value over the address */
45 
46 #define	ASM_JMP(reg, symbol)	/* jump to symbol; reg is clobbered */	\
47 	sethi	%hi(symbol), reg;	/* reg = %hi part of the target */ \
48 	jmpl	reg + %lo(symbol), %g0;	/* jmp: no return address kept */ \
49 	nop				/* branch delay slot */
50 
51 /*
52  * Macro for getting to offset from 'cpu_private' ptr.  The 'cpu_private'
53  * ptr is in the machcpu structure.  The null check tests all 64 bits.
54  *  off_reg:  Offset from 'cpu_private' ptr (operand of the final add).
55  *  scr1:    Scratch, ptr + offset is returned in this register.
56  *  scr2:    Scratch
57  *  label:   Label to branch to if cpu_private ptr is null/zero.
58  */
59 #define	GET_CPU_PRIVATE_PTR(off_reg, scr1, scr2, label)			\
60 	CPU_ADDR(scr1, scr2);						\
61 	ldn	[scr1 + CPU_PRIVATE], scr1;	/* scr1 = cpu_private */ \
62 	cmp	scr1, 0;						\
63 	be	%xcc, label;	/* %icc would only see bits <31:0> */	\
64 	  nop;								\
65 	add	scr1, off_reg, scr1
66 
67 /*
68  * Macro version of get_dcache_dtag.  We use this macro in the
69  * CPU logout code. Since the Dcache is virtually indexed, only
70  * bits [12:5] of the AFAR can be used so we need to search through
71  * 8 indexes (4 ways + bit 13) in order to find the tag we want.
72  *   afar:  input AFAR, not modified.
73  *   datap: input ptr to ch_dc_data_t, at end pts to end of ch_dc_data_t.
74  *   scr1:  scratch.
75  *   scr2:  scratch, holds the tag to look for; reused after a match.
76  *   scr3:  Dcache index, stepped by CH_DCACHE_IDX_INCR until a tag matches.
77  */
78 #define	GET_DCACHE_DTAG(afar, datap, scr1, scr2, scr3)			\
79 	set	CH_DCACHE_IDX_MASK, scr3;				\
80 	and	afar, scr3, scr3;	/* first index to probe */	\
81 	srlx	afar, CH_DCTAG_PA_SHIFT, scr2;				\
82 	b	1f;							\
83 	  or	scr2, CH_DCTAG_VALID_BIT, scr2; /* tag we want */	\
84 	.align	128;							\
85 1:									\
86 	ldxa	[scr3]ASI_DC_TAG, scr1;		/* read tag */		\
87 	cmp	scr1, scr2;						\
88 	bne	4f;				/* not found? */	\
89 	  nop;								\
90 	stxa	scr3, [datap + CH_DC_IDX]%asi;	/* store index */	\
91 	stxa	scr1, [datap + CH_DC_TAG]%asi;	/* store tag */		\
92 	membar	#Sync;			/* Cheetah PRM 10.6.3 */	\
93 	ldxa	[scr3]ASI_DC_UTAG, scr1;	/* read utag */		\
94 	membar	#Sync;			/* Cheetah PRM 10.6.3 */	\
95 	stxa	scr1, [datap + CH_DC_UTAG]%asi;				\
96 	ldxa	[scr3]ASI_DC_SNP_TAG, scr1;	/* read snoop tag */	\
97 	stxa	scr1, [datap + CH_DC_SNTAG]%asi;			\
98 	add	datap, CH_DC_DATA, datap; /* datap -> data words */	\
99 	clr	scr2;			/* data offset starts at 0 */	\
100 2:									\
101 	membar	#Sync;			/* Cheetah PRM 10.6.1 */	\
102 	ldxa	[scr3 + scr2]ASI_DC_DATA, scr1;	/* read data */		\
103 	membar	#Sync;			/* Cheetah PRM 10.6.1 */	\
104 	stxa	scr1, [datap]%asi;					\
105 	add	datap, 8, datap;					\
106 	cmp	scr2, CH_DC_DATA_REG_SIZE - 8;	/* last word done? */	\
107 	blt	2b;							\
108 	  add	scr2, 8, scr2;		/* delay: next offset */	\
109 									\
110 	GET_CPU_IMPL(scr2);	/* Parity bits are elsewhere for */	\
111 	cmp	scr2, PANTHER_IMPL;	/* panther processors. */	\
112 	bne,a	5f;			/* Done if not panther. */	\
113 	  add	datap, 8, datap; /* Skip to the end of the struct. */	\
114 	clr	scr2;							\
115 	add	datap, 7, datap; /* offset of the last parity byte */	\
116 	mov	1, scr1;						\
117 	sll	scr1, PN_DC_DATA_PARITY_BIT_SHIFT, scr1;		\
118 	or	scr3, scr1, scr3; /* add DC_data_parity bit to index */	\
119 3:									\
120 	membar	#Sync;			/* Cheetah PRM 10.6.1 */	\
121 	ldxa	[scr3 + scr2]ASI_DC_DATA, scr1;	/* read parity bits */	\
122 	membar	#Sync;			/* Cheetah PRM 10.6.1 */	\
123 	stba	scr1, [datap]%asi;	/* one parity byte per read */	\
124 	dec	datap;			/* bytes are filled downwards */ \
125 	cmp	scr2, CH_DC_DATA_REG_SIZE - 8;				\
126 	blt	3b;							\
127 	  add	scr2, 8, scr2;						\
128 	b	5f;							\
129 	  add	datap, 5, datap; /* set pointer to end of our struct */	\
130 4:									\
131 	set	CH_DCACHE_IDX_INCR, scr1;	/* incr. idx (scr3) */	\
132 	add	scr3, scr1, scr3;					\
133 	set	CH_DCACHE_IDX_LIMIT, scr1;	/* done? */		\
134 	cmp	scr3, scr1;						\
135 	blt	1b;							\
136 	  nop;								\
137 	add	datap, CH_DC_DATA_SIZE, datap;	/* no match: skip struct */ \
138 5:
139 
140 /*
141  * Macro version of get_icache_dtag.  We use this macro in the CPU
142  * logout code. If the Icache is on, we don't want to capture the data.
143  *   afar:  input AFAR, not modified.
144  *   datap: input ptr to ch_ic_data_t, at end pts to end of ch_ic_data_t.
145  *   scr1:  scratch.
146  *   scr2:  scratch, holds the pa tag to look for; reused after a match.
147  *   scr3:  Icache index, stepped by CH_ICACHE_IDX_INCR until a tag matches.
148  * Note: For Panther, the Icache is virtually indexed and increases in
149  * size to 64KB (instead of 32KB) with a line size of 64 bytes (instead
150  * of 32). This means the IC_addr index bits[14:7] for Panther now
151  * correspond to VA bits[13:6]. But since it is virtually indexed, we
152  * still mask out only bits[12:5] from the AFAR (we have to manually
153  * check bit 13). In order to make this code work for all processors,
154  * we end up checking twice as many indexes (8 instead of 4) as required
155  * for non-Panther CPUs and saving off twice as much data (16 instructions
156  * instead of just 8).
157  */
158 #define	GET_ICACHE_DTAG(afar, datap, scr1, scr2, scr3)			\
159 	ldxa	[%g0]ASI_DCU, scr1;					\
160 	btst	DCU_IC, scr1;		/* is Icache enabled? */	\
161 	bne,a	6f;			/* yes, don't capture */	\
162 	  add	datap, CH_IC_DATA_SIZE, datap;	/* annulled if not taken */ \
163 	GET_CPU_IMPL(scr2);	/* Panther only uses VA[13:6] */	\
164 	cmp	scr2, PANTHER_IMPL;	/* and we also want to mask */	\
165 	be	1f;			/* out bit 13 since the */	\
166 	  nop;				/* Panther I$ is VIPT. */	\
167 	set	CH_ICACHE_IDX_MASK, scr3;	/* non-Panther mask */	\
168 	b	2f;							\
169 	  nop;								\
170 1:									\
171 	set	PN_ICACHE_VA_IDX_MASK, scr3;	/* Panther mask */	\
172 2:									\
173 	and	afar, scr3, scr3;					\
174 	sllx	scr3, CH_ICACHE_IDX_SHIFT, scr3; /* first index to probe */ \
175 	srlx	afar, CH_ICPATAG_SHIFT, scr2;	/* pa tag we want */	\
176 	andn	scr2, CH_ICPATAG_LBITS, scr2;	/* mask off lower */	\
177 	b	3f;							\
178 	  nop;								\
179 	.align	128;							\
180 3:									\
181 	ldxa	[scr3]ASI_IC_TAG, scr1;		/* read pa tag */	\
182 	andn	scr1, CH_ICPATAG_LBITS, scr1;	/* mask off lower */	\
183 	cmp	scr1, scr2;						\
184 	bne	5f;				/* not found? */	\
185 	  nop;								\
186 	stxa	scr3, [datap + CH_IC_IDX]%asi;	/* store index */	\
187 	stxa	scr1, [datap + CH_IC_PATAG]%asi; /* store pa tag */	\
188 	add	scr3, CH_ICTAG_UTAG, scr3;	/* read utag */		\
189 	ldxa	[scr3]ASI_IC_TAG, scr1;					\
190 	add	scr3, (CH_ICTAG_UPPER - CH_ICTAG_UTAG), scr3;		\
191 	stxa	scr1, [datap + CH_IC_UTAG]%asi;				\
192 	ldxa	[scr3]ASI_IC_TAG, scr1;		/* read upper tag */	\
193 	add	scr3, (CH_ICTAG_LOWER - CH_ICTAG_UPPER), scr3;		\
194 	stxa	scr1, [datap + CH_IC_UPPER]%asi;			\
195 	ldxa	[scr3]ASI_IC_TAG, scr1;		/* read lower tag */	\
196 	andn	scr3, CH_ICTAG_TMASK, scr3;	/* drop tag select bits */ \
197 	stxa	scr1, [datap + CH_IC_LOWER]%asi;			\
198 	ldxa	[scr3]ASI_IC_SNP_TAG, scr1;	/* read snoop tag */	\
199 	stxa	scr1, [datap + CH_IC_SNTAG]%asi;			\
200 	add	datap, CH_IC_DATA, datap; /* datap -> instruction data */ \
201 	clr	scr2;			/* data offset starts at 0 */	\
202 4:									\
203 	ldxa	[scr3 + scr2]ASI_IC_DATA, scr1;	/* read ins. data */	\
204 	stxa	scr1, [datap]%asi;					\
205 	add	datap, 8, datap;					\
206 	cmp	scr2, PN_IC_DATA_REG_SIZE - 8;	/* last word done? */	\
207 	blt	4b;							\
208 	  add	scr2, 8, scr2;		/* delay: next offset */	\
209 	b	6f;							\
210 	  nop;								\
211 5:									\
212 	set	CH_ICACHE_IDX_INCR, scr1;	/* incr. idx (scr3) */	\
213 	add	scr3, scr1, scr3;					\
214 	set	PN_ICACHE_IDX_LIMIT, scr1;	/* done? */		\
215 	cmp	scr3, scr1;						\
216 	blt	3b;							\
217 	  nop;								\
218 	add	datap, CH_IC_DATA_SIZE, datap;	/* no match: skip struct */ \
219 6:
220 
221 #if defined(JALAPENO) || defined(SERRANO)
222 /*
223  * Macro version of get_ecache_dtag.  We use this macro in the
224  * CPU logout code.  Stores the E$ index, tag and data for one way.
225  *   afar:	input AFAR, not modified
226  *   datap:	Ptr to ch_ec_data_t, at end pts just past ch_ec_data_t.
227  *   ec_way:	Constant value (way number)
228  *   scr1:      Scratch
229  *   scr2:	Scratch.
230  *   scr3:	Scratch, holds the E$ index being read.
231  */
232 #define	GET_ECACHE_DTAG(afar, datap, ec_way, scr1, scr2, scr3)		\
233 	mov	ec_way, scr1;						\
234 	and	scr1, JP_ECACHE_NWAY - 1, scr1;	/* mask E$ way bits */	\
235 	sllx	scr1, JP_EC_TAG_DATA_WAY_SHIFT, scr1;			\
236 	set	((JP_ECACHE_MAX_SIZE / JP_ECACHE_NWAY) - 1), scr2;	\
237 	and	afar, scr2, scr3;		/* get set offset */	\
238 	andn	scr3, (JP_ECACHE_MAX_LSIZE - 1), scr3; /* VA<5:0>=0 */	\
239 	or	scr3, scr1, scr3;		/* or WAY bits */	\
240 	b	1f;							\
241 	  stxa	scr3, [datap + CH_EC_IDX]%asi;	/* store E$ index */	\
242 	.align	64;							\
243 1:									\
244 	JP_EC_DIAG_ACCESS_MEMBAR;					\
245 	ldxa    [scr3]ASI_EC_DIAG, scr1;	/* get E$ tag */	\
246 	JP_EC_DIAG_ACCESS_MEMBAR;					\
247 	stxa	scr1, [datap + CH_EC_TAG]%asi;				\
248 	add	datap, CH_EC_DATA, datap;	/* datap -> data area */ \
249 2:									\
250 	ldxa	[scr3]ASI_EC_R, %g0;		/* ld E$ stging regs */	\
251 	clr	scr1;			/* staging reg offset = 0 */	\
252 3:						/* loop thru 5 regs */	\
253 	ldxa	[scr1]ASI_EC_DATA, scr2;				\
254 	stxa	scr2, [datap]%asi;					\
255 	add	datap, 8, datap;					\
256 	cmp	scr1, CH_ECACHE_STGREG_TOTALSIZE - 8;			\
257 	bne	3b;							\
258 	   add	scr1, 8, scr1;		/* delay: next staging reg */	\
259 	btst	CH_ECACHE_STGREG_SIZE, scr3;	/* done? */		\
260 	beq	2b;							\
261 	   add	scr3, CH_ECACHE_STGREG_SIZE, scr3 /* delay: always runs */
262 
263 #define	GET_ECACHE_DTAGS(afar, datap, scr1, scr2, scr3)			\
264 	GET_ECACHE_DTAG(afar, datap, 0, scr1, scr2, scr3); /* way 0 */	\
265 	GET_ECACHE_DTAG(afar, datap, 1, scr1, scr2, scr3); /* way 1 */	\
266 	GET_ECACHE_DTAG(afar, datap, 2, scr1, scr2, scr3); /* way 2 */	\
267 	GET_ECACHE_DTAG(afar, datap, 3, scr1, scr2, scr3); /* way 3 */	\
268 	add	datap, (CHD_EC_DATA_SETS-4)*CH_EC_DATA_SIZE, datap;	\
269 	add	datap, CH_EC_DATA_SIZE * PN_L2_NWAYS, datap; /* skip L2 area */
270 
271 /*
272  * Jalapeno does not have cores so these macros expand to nothing.
273  */
274 #define	PARK_SIBLING_CORE(dcucr_reg, scr1, scr2)
275 #define	UNPARK_SIBLING_CORE(dcucr_reg, scr1, scr2)
276 
277 #if defined(JALAPENO)
278 /*
279  * Jalapeno: log the primary AFAR and AFSR, then write the AFSR value
280  * back minus its fatal error bits, so only non-fatal bits are cleared.
281  *	datap:		pointer to cpu logout structure.
282  *	afar:		returned primary AFAR value.
283  *	scr1:		scratch
284  *	scr2:		scratch
285  */
286 #define	GET_AFSR_AFAR(datap, afar, scr1, scr2)				\
287 	ldxa	[%g0]ASI_AFAR, afar;		/* primary AFAR */	\
288 	stxa	afar, [datap + (CH_CLO_DATA + CH_CHD_AFAR)]%asi;	\
289 	ldxa	[%g0]ASI_AFSR, scr2;		/* primary AFSR */	\
290 	stxa	scr2, [datap + (CH_CLO_DATA + CH_CHD_AFSR)]%asi;	\
291 	sethi	%hh(C_AFSR_FATAL_ERRS), scr1;				\
292 	sllx	scr1, 32, scr1;		/* scr1 = fatal-bit mask */	\
293 	andn	scr2, scr1, scr2;	/* drop the fatal bits, so the */ \
294 	stxa	scr2, [%g0]ASI_AFSR;	/* write leaves them as is */	\
295 	membar	#Sync
296 
297 /*
298  * Jalapeno has no shadow AFAR, so this macro expands to nothing.
299  */
300 #define	GET_SHADOW_DATA(afar, datap, scr1, scr2, scr3)
301 
302 #elif defined(SERRANO)
303 /*
304  * Serrano: log AFAR2, the primary AFAR and the AFSR, then write the AFSR
305  * value back minus its fatal error bits, so that only the non-fatal
306  * bits are cleared.
307  *	datap:	pointer to cpu logout structure.
308  *	afar:	returned primary AFAR value.
309  *	scr1:	scratch
310  *	scr2:	scratch
311  */
312 #define GET_AFSR_AFAR(datap, afar, scr1, scr2)				\
313 	set	ASI_MCU_AFAR2_VA, scr1;					\
314 	ldxa	[scr1]ASI_MCU_CTRL, afar;	/* AFAR2 first */	\
315 	stxa	afar, [datap + (CH_CLO_DATA + CH_CHD_AFAR2)]%asi;	\
316 	ldxa	[%g0]ASI_AFAR, afar;		/* primary AFAR */	\
317 	stxa	afar, [datap + (CH_CLO_DATA + CH_CHD_AFAR)]%asi;	\
318 	ldxa	[%g0]ASI_AFSR, scr2;		/* primary AFSR */	\
319 	stxa	scr2, [datap + (CH_CLO_DATA + CH_CHD_AFSR)]%asi;	\
320 	sethi	%hh(C_AFSR_FATAL_ERRS), scr1;				\
321 	sllx	scr1, 32, scr1;		/* scr1 = fatal-bit mask */	\
322 	andn	scr2, scr1, scr2;	/* drop the fatal bits, so the */ \
323 	stxa	scr2, [%g0]ASI_AFSR;	/* write leaves them as is */	\
324 	membar	#Sync
325 
326 /*
327  * Serrano needs to capture E$, D$ and I$ lines associated with afar2.
328  *      afar:   scratch, holds afar2 (reloaded from the logout struct).
329  *      datap:  pointer to cpu logout structure
330  *      scr1:   scratch
331  *      scr2:   scratch
332  *      scr3:   scratch
333  */
334 #define	GET_SHADOW_DATA(afar, datap, scr1, scr2, scr3)		\
335 	ldxa	[datap + (CH_CLO_DATA + CH_CHD_AFAR2)]%asi, afar;	\
336 	add	datap, CH_CLO_SDW_DATA + CH_CHD_EC_DATA, datap;		\
337 	GET_ECACHE_DTAGS(afar, datap, scr1, scr2, scr3);		\
338 	GET_DCACHE_DTAG(afar, datap, scr1, scr2, scr3);			\
339 	GET_ICACHE_DTAG(afar, datap, scr1, scr2, scr3);			\
340 	sub	datap, CH_CPU_LOGOUT_SIZE, datap /* presumably rewinds datap */
341 #endif /* SERRANO */
342 
343 #elif defined(CHEETAH_PLUS)
344 /*
345  * Macro version of get_ecache_dtag.  We use this macro in the
346  * CPU logout code.  Stores the E$ index, tag, tag ECC and data.
347  *   afar:	input AFAR, not modified.
348  *   datap:	Ptr to ch_ec_data_t, at end pts just past ch_ec_data_t.
349  *   pn_way:	ecache way for panther (value = 0-3). For non-panther
350  *		cpus, this macro will be called with pn_way = 0.
351  *   scr1:	Scratch.
352  *   scr2:	Scratch.
353  *   scr3:	Scratch, holds the E$ index being read.
354  */
355 #define	GET_ECACHE_DTAG(afar, datap, pn_way, scr1, scr2, scr3)		\
356 	mov	afar, scr3;						\
357 	andn	scr3, (CH_ECACHE_SUBBLK_SIZE - 1), scr3; /* VA<5:0>=0 */\
358 	set	(CH_ECACHE_8M_SIZE - 1), scr2;				\
359 	and	scr3, scr2, scr3;		/* VA<63:23>=0 */	\
360 	mov	pn_way, scr1;	/* panther L3$ is 4-way so we ...    */	\
361 	sllx	scr1, PN_L3_WAY_SHIFT, scr1;	/* need to mask...   */	\
362 	or	scr3, scr1, scr3;	/* in the way bits <24:23>.  */	\
363 	b	1f;							\
364 	   stxa	scr3, [datap + CH_EC_IDX]%asi;	/* store E$ index */	\
365 	.align	64;							\
366 1:									\
367 	ldxa    [scr3]ASI_EC_DIAG, scr1;	/* get E$ tag */	\
368 	stxa     scr1, [datap + CH_EC_TAG]%asi;				\
369 	set	CHP_ECACHE_IDX_TAG_ECC, scr1;				\
370 	or	scr3, scr1, scr1;					\
371 	ldxa    [scr1]ASI_EC_DIAG, scr1;	/* get E$ tag ECC */	\
372 	stxa	scr1, [datap + CH_EC_TAG_ECC]%asi;			\
373 	add	datap, CH_EC_DATA, datap;	/* datap -> data area */ \
374 2:									\
375 	ldxa	[scr3]ASI_EC_R, %g0;		/* ld E$ stging regs */	\
376 	clr	scr1;			/* staging reg offset = 0 */	\
377 3:						/* loop thru 5 regs */	\
378 	ldxa	[scr1]ASI_EC_DATA, scr2;				\
379 	stxa	scr2, [datap]%asi;					\
380 	add	datap, 8, datap;					\
381 	cmp	scr1, CH_ECACHE_STGREG_TOTALSIZE - 8;			\
382 	bne	3b;							\
383 	   add	scr1, 8, scr1;		/* delay: next staging reg */	\
384 	btst	CH_ECACHE_STGREG_SIZE, scr3;	/* done? */		\
385 	beq	2b;							\
386 	   add	scr3, CH_ECACHE_STGREG_SIZE, scr3 /* delay: always runs */
387 
388 /*
389  * If this is a panther, we need to make sure the sibling core is
390  * parked so that we avoid any race conditions during diagnostic
391  * accesses to the shared L2 and L3 caches.
392  * dcucr_reg:	This register will be used to keep track of whether
393  *		or not we need to unpark the core later.
394  *		It just so happens that we also use this same register
395  *		to keep track of our saved DCUCR value so we only touch
396  *		bit 4 of the register (which is a "reserved" bit in the
397  *		DCUCR) for keeping track of core parking.
398  * scr1:	Scratch register.
399  * scr2:	Scratch register, holds the bit for the current core.
400  */
401 #define	PARK_SIBLING_CORE(dcucr_reg, scr1, scr2)			\
402 	GET_CPU_IMPL(scr1);						\
403 	cmp	scr1, PANTHER_IMPL;	/* only park for panthers */	\
404 	bne,a	%xcc, 2f;						\
405 	  andn	dcucr_reg, PN_PARKED_OTHER_CORE, dcucr_reg;		\
406 	set	ASI_CORE_RUNNING_STATUS, scr1;	/* check other core */	\
407 	ldxa	[scr1]ASI_CMP_SHARED, scr2;	/* is it running?   */	\
408 	cmp	scr2, PN_BOTH_CORES_RUNNING;				\
409 	bne,a	%xcc, 2f;	/* if not running, we are done */	\
410 	  andn	dcucr_reg, PN_PARKED_OTHER_CORE, dcucr_reg;		\
411 	or	dcucr_reg, PN_PARKED_OTHER_CORE, dcucr_reg;		\
412 	set	ASI_CORE_ID, scr1;					\
413 	ldxa	[scr1]ASI_CMP_PER_CORE, scr2;				\
414 	and	scr2, COREID_MASK, scr2;	/* scr2 = our core id */ \
415 	or	%g0, 1, scr1;		/* find out which core... */	\
416 	sll	scr1, scr2, scr2;	/* ... we need to park... */	\
417 1:									\
418 	set	ASI_CORE_RUNNING_RW, scr1;				\
419 	ldxa    [scr1]ASI_CMP_SHARED, scr1;	/* ...but are we? */	\
420 	btst    scr1, scr2;        /* check our own parked status */	\
421 	bz      %xcc, 1b;        /* if we are then go round again */	\
422 	nop;				/* branch delay slot */		\
423 	set	ASI_CORE_RUNNING_RW, scr1;	/* else proceed... */	\
424 	stxa	scr2, [scr1]ASI_CMP_SHARED;	/* ... and park it. */	\
425 	membar	#Sync;							\
426 	set	ASI_CORE_RUNNING_STATUS, scr1;	/* spin until... */	\
427 	ldxa	[scr1]ASI_CMP_SHARED, scr1;	/* ... the other...  */	\
428 	cmp	scr1, scr2;	/* ...core is parked according to... */	\
429 	bne,a	%xcc, 1b;	/* ...the core running status reg.  */	\
430 	  nop;								\
431 2:
432 
433 /*
434  * The core running this code will unpark its sibling core if the
435  * sibling core had been parked by the current core earlier in this
436  * trap handler.
437  * dcucr_reg:	This register is used to keep track of whether or not
438  *		we need to unpark our sibling core.
439  *		It just so happens that we also use this same register
440  *		to keep track of our saved DCUCR value so we only touch
441  *		bit 4 of the register (which is a "reserved" bit in the
442  *		DCUCR) for keeping track of core parking.
443  * scr1:	Scratch register.
444  * scr2:	Scratch register.
445  */
446 #define	UNPARK_SIBLING_CORE(dcucr_reg, scr1, scr2)			\
447 	btst	PN_PARKED_OTHER_CORE, dcucr_reg;			\
448 	bz,pt	%xcc, 1f;	/* if nothing to unpark, we are done */	\
449 	  andn	dcucr_reg, PN_PARKED_OTHER_CORE, dcucr_reg; /* always runs */ \
450 	set	ASI_CORE_RUNNING_RW, scr1;				\
451 	set	PN_BOTH_CORES_RUNNING, scr2;	/* we want both...   */	\
452 	stxa	scr2, [scr1]ASI_CMP_SHARED;	/* ...cores running. */	\
453 	membar	#Sync;							\
454 1:
455 
456 /*
457  * Cheetah+ and Jaguar get both primary and secondary AFSR/AFAR.  All bits
458  * in the primary AFSR are cleared except the fatal error bits.  For Panther,
459  * we also have to read and clear the AFSR_EXT, again leaving the fatal
460  * error bits alone.
461  *	datap:		pointer to cpu logout structure.
462  *	afar:		returned primary AFAR value.
463  *	scr1:		scratch
464  *	scr2:		scratch
465  */
466 #define	GET_AFSR_AFAR(datap, afar, scr1, scr2)				\
467 	set	ASI_SHADOW_REG_VA, scr1;				\
468 	ldxa	[scr1]ASI_AFAR, scr2;		/* shadow AFAR */	\
469 	stxa	scr2, [datap + (CH_CLO_SDW_DATA + CH_CHD_AFAR)]%asi;	\
470 	ldxa	[scr1]ASI_AFSR, scr2;		/* shadow AFSR */	\
471 	stxa	scr2, [datap + (CH_CLO_SDW_DATA + CH_CHD_AFSR)]%asi;	\
472 	ldxa	[%g0]ASI_AFAR, afar;		/* primary AFAR */	\
473 	stxa	afar, [datap + (CH_CLO_DATA + CH_CHD_AFAR)]%asi;	\
474 	ldxa	[%g0]ASI_AFSR, scr2;		/* primary AFSR */	\
475 	stxa	scr2, [datap + (CH_CLO_DATA + CH_CHD_AFSR)]%asi;	\
476 	sethi	%hh(C_AFSR_FATAL_ERRS), scr1;				\
477 	sllx	scr1, 32, scr1;						\
478 	bclr	scr1, scr2;	/* Clear fatal error bits here, so */ 	\
479 	stxa	scr2, [%g0]ASI_AFSR; /* they're left as is in AFSR */	\
480 	membar	#Sync;							\
481 	GET_CPU_IMPL(scr1);						\
482 	cmp	scr1, PANTHER_IMPL;					\
483 	bne	%xcc, 1f;		/* AFSR_EXT: Panther only */	\
484 	   nop;								\
485 	set	ASI_SHADOW_AFSR_EXT_VA, scr1;	/* shadow AFSR_EXT */	\
486 	ldxa	[scr1]ASI_AFSR, scr2;					\
487 	stxa	scr2, [datap + (CH_CLO_SDW_DATA + CH_CHD_AFSR_EXT)]%asi; \
488 	set	ASI_AFSR_EXT_VA, scr1;		/* primary AFSR_EXT */	\
489 	ldxa	[scr1]ASI_AFSR, scr2;					\
490 	stxa	scr2, [datap + (CH_CLO_DATA + CH_CHD_AFSR_EXT)]%asi;	\
491 	set	C_AFSR_EXT_FATAL_ERRS, scr1;				\
492 	bclr	scr1, scr2;	/* Clear fatal error bits here, */	\
493 	set	ASI_AFSR_EXT_VA, scr1;	/* so they're left */		\
494 	stxa	scr2, [scr1]ASI_AFSR;	/* as is in AFSR_EXT */		\
495 	membar	#Sync;							\
496 1:
497 
498 /*
499  * This macro is used in the CPU logout code to capture diagnostic
500  * information from the L2 cache on panther processors.
501  *   afar:	input AFAR, not modified.
502  *   datap:	Ptr to pn_l2_data_t, at end pts just past pn_l2_data_t.
503  *   scr1:	Scratch.
504  *   scr2:	Scratch.
505  *   scr3:	Scratch, holds the L2$ index; stepped by PN_L2_SET_SIZE.
506  */
507 #define	GET_PN_L2_CACHE_DTAGS(afar, datap, scr1, scr2, scr3)		\
508 	mov	afar, scr3;						\
509 	set	PN_L2_INDEX_MASK, scr1;					\
510 	and	scr3, scr1, scr3;	/* scr3 = first L2$ index */	\
511 	b	1f;	/* code to read tags and data should be ...  */	\
512 	   nop;		/* ...on the same cache line if possible.    */	\
513 	.align	128;	/* update this line if you add lines below. */	\
514 1:									\
515 	stxa	scr3, [datap + CH_EC_IDX]%asi;	/* store L2$ index  */	\
516 	ldxa	[scr3]ASI_L2_TAG, scr1;		/* read the L2$ tag */	\
517 	stxa	scr1, [datap + CH_EC_TAG]%asi;				\
518 	add	datap, CH_EC_DATA, datap;	/* datap -> data area */ \
519 	clr	scr1;			/* data offset starts at 0 */	\
520 2:									\
521 	ldxa	[scr3 + scr1]ASI_L2_DATA, scr2;	/* loop through     */	\
522 	stxa	scr2, [datap]%asi;		/* <511:256> of L2  */	\
523 	add	datap, 8, datap;		/* data and record  */	\
524 	cmp	scr1, (PN_L2_LINESIZE / 2) - 8;	/* it in the cpu    */	\
525 	bne	2b;				/* logout struct.   */	\
526 	  add	scr1, 8, scr1;		/* delay: next offset */	\
527 	set	PN_L2_DATA_ECC_SEL, scr2;	/* ECC_sel bit.     */	\
528 	ldxa	[scr3 + scr2]ASI_L2_DATA, scr2;	/* Read and record  */	\
529 	stxa	scr2, [datap]%asi;		/* ecc of <511:256> */	\
530 	add	datap, 8, datap;					\
531 3:									\
532 	ldxa	[scr3 + scr1]ASI_L2_DATA, scr2;	/* loop through     */	\
533 	stxa	scr2, [datap]%asi;		/* <255:0> of L2    */	\
534 	add	datap, 8, datap;		/* data and record  */	\
535 	cmp	scr1, PN_L2_LINESIZE - 8;	/* it in the cpu    */	\
536 	bne	3b;				/* logout struct.   */	\
537 	  add	scr1, 8, scr1;		/* delay: next offset */	\
538 	set	PN_L2_DATA_ECC_SEL, scr2;	/* ECC_sel bit.     */	\
539 	add	scr2, PN_L2_ECC_LO_REG, scr2;				\
540 	ldxa	[scr3 + scr2]ASI_L2_DATA, scr2;	/* Read and record  */	\
541 	stxa	scr2, [datap]%asi;		/* ecc of <255:0>.  */	\
542 	add	datap, 8, datap;		/* Advance pointer  */	\
543 	set	PN_L2_SET_SIZE, scr2;		/* step to next way */	\
544 	set	PN_L2_MAX_SET, scr1;					\
545 	cmp	scr1, scr3;	/* more ways to try for this line? */	\
546 	bg,a	%xcc, 1b;	/* if so, start over with next way */	\
547 	  add	scr3, scr2, scr3
548 
549 /*
550  * Non-Panther: grab both lines of the 2-way E$. Panther: 4 L3 ways, then L2.
551  *	afar:	AFAR from access (non-Panther: way bit flipped, then restored).
552  *	datap:	ptr to the E$ data area of the logout struct; advanced past it.
553  *	scr1:	scratch
554  *	scr2:	scratch
555  *	scr3:	scratch
556  */
557 #define	GET_ECACHE_DTAGS(afar, datap, scr1, scr2, scr3)			\
558 	GET_CPU_IMPL(scr1);						\
559 	cmp	scr1, PANTHER_IMPL;					\
560 	bne	%xcc, 4f;		/* 4: non-Panther path */	\
561 	  nop;								\
562 	GET_ECACHE_DTAG(afar, datap, 0, scr1, scr2, scr3);		\
563 	GET_ECACHE_DTAG(afar, datap, 1, scr1, scr2, scr3);		\
564 	GET_ECACHE_DTAG(afar, datap, 2, scr1, scr2, scr3);		\
565 	GET_ECACHE_DTAG(afar, datap, 3, scr1, scr2, scr3);		\
566 	add	datap, (CHD_EC_DATA_SETS-4)*CH_EC_DATA_SIZE, datap;	\
567 	GET_PN_L2_CACHE_DTAGS(afar, datap, scr1, scr2, scr3);		\
568 	b	5f;							\
569 	  nop;								\
570 4:									\
571 	GET_ECACHE_DTAG(afar, datap, 0, scr1, scr2, scr3);		\
572 	GET_ECACHE_WAY_BIT(scr1, scr2);					\
573 	xor	afar, scr1, afar;	/* flip to the other way */	\
574 	GET_ECACHE_DTAG(afar, datap, 0, scr1, scr2, scr3);		\
575 	GET_ECACHE_WAY_BIT(scr1, scr2);		/* restore AFAR */	\
576 	xor	afar, scr1, afar;					\
577 	add	datap, (CHD_EC_DATA_SETS-2)*CH_EC_DATA_SIZE, datap;	\
578 	add	datap, CH_EC_DATA_SIZE * PN_L2_NWAYS, datap; /* skip L2 area */ \
579 5:
580 
581 /*
582  * Cheetah+ needs to capture E$, D$ and I$ lines associated with
583  * shadow afar.
584  *	afar:	scratch, holds shadow afar (reloaded from the logout struct).
585  *	datap:	pointer to cpu logout structure
586  *	scr1:	scratch
587  *	scr2:	scratch
588  *	scr3:	scratch
589  */
590 #define	GET_SHADOW_DATA(afar, datap, scr1, scr2, scr3)		\
591 	ldxa	[datap + (CH_CLO_SDW_DATA + CH_CHD_AFAR)]%asi, afar;	\
592 	add	datap, CH_CLO_SDW_DATA + CH_CHD_EC_DATA, datap;	\
593 	GET_ECACHE_DTAGS(afar, datap, scr1, scr2, scr3);		\
594 	GET_DCACHE_DTAG(afar, datap, scr1, scr2, scr3);			\
595 	GET_ICACHE_DTAG(afar, datap, scr1, scr2, scr3);			\
596 	sub	datap, CH_CPU_LOGOUT_SIZE, datap /* presumably rewinds datap */
597 
598 /*
599  * Compute the "Way" bit for 2-way Ecache for Cheetah+ (result in scr1).
600  */
601 #define	GET_ECACHE_WAY_BIT(scr1, scr2)					\
602 	CPU_INDEX(scr1, scr2);		/* presumably this CPU's index */ \
603 	mulx	scr1, CPU_NODE_SIZE, scr1;	/* offset of its entry */ \
604 	add	scr1, ECACHE_SIZE, scr1;	/* offset of the field */ \
605 	set	cpunodes, scr2;						\
606 	ld	[scr1 + scr2], scr1;		/* 32-bit load from cpunodes */ \
607 	srlx	scr1, 1, scr1			/* way bit = loaded value / 2 */
608 
609 #else /* CHEETAH_PLUS */
610 /*
611  * Macro version of get_ecache_dtag.  We use this macro in the
612  * CPU logout code.  Stores the E$ index, tag and data.
613  *   afar:	input AFAR, not modified.
614  *   datap:	Ptr to ch_ec_data_t, at end pts just past ch_ec_data_t.
615  *   scr1:      Scratch.
616  *   scr2:	Scratch.
617  *   scr3:	Scratch, holds the E$ index being read.
618  */
619 #define	GET_ECACHE_DTAG(afar, datap, scr1, scr2, scr3)			\
620 	mov	afar, scr3;						\
621 	andn	scr3, (CH_ECACHE_SUBBLK_SIZE - 1), scr3; /* VA<5:0>=0 */\
622 	set	(CH_ECACHE_8M_SIZE - 1), scr2;				\
623 	and	scr3, scr2, scr3;		/* VA<63:23>=0 */	\
624 	b	1f;							\
625 	   stxa	scr3, [datap + CH_EC_IDX]%asi;	/* store E$ index */	\
626 	.align	64;							\
627 1:									\
628 	ldxa    [scr3]ASI_EC_DIAG, scr1;	/* get E$ tag */	\
629 	stxa	scr1, [datap + CH_EC_TAG]%asi;				\
630 	add	datap, CH_EC_DATA, datap;	/* datap -> data area */ \
631 2:									\
632 	ldxa	[scr3]ASI_EC_R, %g0;		/* ld E$ stging regs */	\
633 	clr	scr1;			/* staging reg offset = 0 */	\
634 3:						/* loop thru 5 regs */	\
635 	ldxa	[scr1]ASI_EC_DATA, scr2;				\
636 	stxa	scr2, [datap]%asi;					\
637 	add	datap, 8, datap;					\
638 	cmp	scr1, CH_ECACHE_STGREG_TOTALSIZE - 8;			\
639 	bne	3b;							\
640 	   add	scr1, 8, scr1;		/* delay: next staging reg */	\
641 	btst	CH_ECACHE_STGREG_SIZE, scr3;	/* done? */		\
642 	beq	2b;							\
643 	   add	scr3, CH_ECACHE_STGREG_SIZE, scr3 /* delay: always runs */
644 
645 /*
646  * Cheetah does not have cores so these macros expand to nothing.
647  */
648 #define	PARK_SIBLING_CORE(dcucr_reg, scr1, scr2)
649 #define	UNPARK_SIBLING_CORE(dcucr_reg, scr1, scr2)
650 
651 /*
652  * Cheetah: log the primary AFAR and AFSR, then write the AFSR value
653  * back minus its fatal error bits, so only non-fatal bits are cleared.
654  *	datap:		pointer to cpu logout structure.
655  *	afar:		returned primary AFAR value.
656  *	scr1:		scratch
657  *	scr2:		scratch
658  */
659 #define	GET_AFSR_AFAR(datap, afar, scr1, scr2)	\
660 	ldxa	[%g0]ASI_AFAR, afar;		/* primary AFAR */	\
661 	stxa	afar, [datap + (CH_CLO_DATA + CH_CHD_AFAR)]%asi;	\
662 	ldxa	[%g0]ASI_AFSR, scr2;		/* primary AFSR */	\
663 	stxa	scr2, [datap + (CH_CLO_DATA + CH_CHD_AFSR)]%asi;	\
664 	sethi	%hh(C_AFSR_FATAL_ERRS), scr1;				\
665 	sllx	scr1, 32, scr1;		/* scr1 = fatal-bit mask */	\
666 	andn	scr2, scr1, scr2;	/* drop the fatal bits, so the */ \
667 	stxa	scr2, [%g0]ASI_AFSR;	/* write leaves them as is */	\
668 	membar	#Sync
669 
670 /*
671  * Cheetah E$ is direct-mapped, so we grab line data and skip second line.
672  *	afar:	AFAR from access.
673  *	datap:	ptr to the E$ data area of the logout struct; advanced past it.
674  *	scr1:	scratch
675  *	scr2:	scratch
676  *	scr3:	scratch
677  */
678 #define	GET_ECACHE_DTAGS(afar, datap, scr1, scr2, scr3)			\
679 	GET_ECACHE_DTAG(afar, datap, scr1, scr2, scr3);			\
680 	add	datap, (CHD_EC_DATA_SETS-1)*CH_EC_DATA_SIZE, datap;	\
681 	add	datap, CH_EC_DATA_SIZE * PN_L2_NWAYS, datap; /* skip L2 area */
682 
683 /*
684  * Cheetah has no shadow AFAR, so this macro expands to nothing.
685  */
686 #define	GET_SHADOW_DATA(afar, datap, scr1, scr2, scr3)
687 
688 #endif	/* CHEETAH_PLUS */
689 
690 /*
691  * Cheetah/(Cheetah+ Jaguar Panther)/Jalapeno Macro for capturing CPU
692  * logout data at TL>0. r_val is a register that returns the "failure count"
693  * to the caller, and may be used as a scratch register until the end of
694  * the macro.  afar is used to return the primary AFAR value to the caller
695  * and it too can be used as a scratch register until the end. datap is
696  * a register that holds the address of the logout structure in which
697  * to deposit the logout data.  t_flags is a register that has the
698  * trap-type/trap-level/CEEN info. This t_flags register may be used after
699  * the GET_AFSR_AFAR macro.
700  *
701  * The CPU logout operation will fail (r_val > 0) if the logout
702  * structure in question is already being used, i.e. its saved AFAR is
703  * not LOGOUT_INVALID. Otherwise, the CPU logout operation will succeed
704  * (r_val = 0). For failures, r_val returns the busy count (# of times
705  * we tried using this CPU logout structure when it was busy.)
706  *
707  *   Register usage:
708  *	%asi:   Must be set to either ASI_MEM if the address in datap
709  *		is a physical address or to ASI_N if the address in
710  *		datap is a virtual address.
711  *	r_val:	This register is the return value which tells the
712  *		caller whether or not the LOGOUT operation was successful.
713  *		For failures, r_val returns the fail count (i.e. number of
714  *		times we have tried to use this logout structure when it was
715  *		already being used).
716  *	afar:	output: contains AFAR on exit
717  *	t_flags: input trap type info, may be used as scratch after stored
718  *		to cpu log out structure.
719  *	datap:	Points to log out data area.
720  *	scr1:	Scratch
721  *	scr2:	Scratch (may be r_val)
722  *	scr3:   Scratch (may be t_flags)
723  */
724 #define	DO_TL1_CPU_LOGOUT(r_val, afar, t_flags, datap, scr1, scr2, scr3) \
725 	setx	LOGOUT_INVALID, scr2, scr1;				\
726 	ldxa	[datap + (CH_CLO_DATA + CH_CHD_AFAR)]%asi, scr2;	\
727 	cmp	scr2, scr1;		/* is the struct free? */	\
728 	bne	8f;			/* no, take the busy path */	\
729 	  nop;								\
730 	stxa	t_flags, [datap + CH_CLO_FLAGS]%asi;			\
731 	GET_AFSR_AFAR(datap, afar, scr1, scr2);				\
732 	add	datap, CH_CLO_DATA + CH_CHD_EC_DATA, datap;		\
733 	GET_ECACHE_DTAGS(afar, datap, scr1, scr2, scr3);		\
734 	GET_DCACHE_DTAG(afar, datap, scr1, scr2, scr3);			\
735 	GET_ICACHE_DTAG(afar, datap, scr1, scr2, scr3);			\
736 	sub	datap, CH_CLO_DATA + CH_DIAG_DATA_SIZE, datap;		\
737 	GET_SHADOW_DATA(afar, datap, scr1, scr2, scr3);			\
738 	ldxa	[datap + (CH_CLO_DATA + CH_CHD_AFAR)]%asi, afar;	\
739 	set	0, r_val;	/* return value for success */		\
740 	ba	9f;							\
741 	  nop;								\
742 8:									\
743 	ldxa	[%g0]ASI_AFAR, afar;	/* busy: still return AFAR */	\
744 	ldxa	[datap + CH_CLO_NEST_CNT]%asi, r_val;			\
745 	inc	r_val;		/* return value for failure */		\
746 	stxa	r_val, [datap + CH_CLO_NEST_CNT]%asi;			\
747 	membar	#Sync;							\
748 9:
749 
750 /*
751  * Cheetah/(Cheetah+ Jaguar Panther)/Jalapeno Macro for capturing CPU
752  * logout data.  Uses DO_TL1_CPU_LOGOUT macro defined above, and sets
753  * up the expected data pointer in the scr1 register and sets the %asi
754  * register to ASI_N for kernel virtual addresses instead of ASI_MEM as
755  * is used at TL>0.
756  *
757  * The CPU logout operation will fail (r_val > 0) if the logout
758  * structure in question is already being used. Otherwise, the CPU
759  * logout operation will succeed (r_val = 0). For failures, r_val
760  * returns the busy count (# of times we tried using this CPU logout
761  * structure when it was busy.)
762  *
763  *   Register usage:
764  *	r_val:	This register is the return value which tells the
765  *		caller whether or not the LOGOUT operation was successful.
766  *		For failures, r_val returns the fail count (i.e. number of
767  *		times we have tried to use this logout structure when it was
768  *		already being used.
769  *	afar:	returns AFAR, used internally as afar value.
770  *		output: if the cpu_private struct has not been initialized,
771  *		        then we return the t_flags value listed below.
772  *	r_or_s:	input offset, either register or constant (symbol).  It's
773  *		OK for r_or_s to be a register as long as it's not scr1 or
774  *		scr3.
775  *	t_flags: input trap type info, may be used as scratch after stored
776  *		to cpu log out structure.
777  *	scr1:	Scratch, points to log out data area.
778  *	scr2:	Scratch (may be r_or_s)
779  *	scr3:	Scratch (may be r_val)
780  *	scr4:   Scratch (may be t_flags)
781  */
782 #define	DO_CPU_LOGOUT(r_val, afar, r_or_s, t_flags, scr1, scr2, scr3, scr4) \
783 	GET_CPU_PRIVATE_PTR(r_or_s, scr1, scr3, 7f); /* can't use scr2/4 */ \
784 	wr	%g0, ASI_N, %asi;	/* %asi <- ASI_N (kernel VAs) */ \
785 	DO_TL1_CPU_LOGOUT(r_val, afar, t_flags, scr1, scr2, scr3, scr4)	\
786 	ba	6f;		/* logout attempted: skip the 7: path */ \
787 	  nop;				/* branch delay slot */		\
788 7:				/* here if cpu_private ptr is null */	\
789 	mov	t_flags, afar;		/* depends on afar = %g2  */	\
790 	set	0, r_val;		/* success in this case.  */	\
791 6:				/* common exit for both paths */
792 
793 /*
794  * The P$ is flushed as a side effect of writing to the Primary
795  * or Secondary Context Register. After writing to a context
796  * register, every line of the P$ in the Valid state is invalidated,
797  * regardless of which context it belongs to.
798  * This routine simply touches the Primary context register by
799  * reading the current value and writing it back. The Primary
800  * context is not changed.
801  */
802 #define	PCACHE_FLUSHALL(tmp1, tmp2, tmp3)				\
803 	sethi	%hi(FLUSH_ADDR), tmp1	/* tmp1 = operand for flush */	;\
804 	set	MMU_PCONTEXT, tmp2	/* tmp2 = context reg address */ ;\
805 	ldxa	[tmp2]ASI_DMMU, tmp3	/* tmp3 = current value */	;\
806 	stxa	tmp3, [tmp2]ASI_DMMU	/* write same value back */	;\
807 	flush	tmp1	/* See Cheetah PRM 8.10.2 */
808 
809 /*
810  * Macro that flushes the entire Dcache.
811  *
812  * arg1 = dcache size
813  * arg2 = dcache linesize
814  */
815 #define	CH_DCACHE_FLUSHALL(arg1, arg2, tmp1)				\
816 	sub	arg1, arg2, tmp1;	/* tmp1 = size - linesize */	\
817 1:									\
818 	stxa	%g0, [tmp1]ASI_DC_TAG;	/* store zero to tag at tmp1 */	\
819 	membar	#Sync;							\
820 	cmp	%g0, tmp1;		/* was that offset 0? */	\
821 	bne,pt	%icc, 1b;		/* no: go do the next one */	\
822 	  sub	tmp1, arg2, tmp1;	/* delay slot: step down one line */
823 
824 /*
825  * Macro that flushes the entire Icache.
826  *
827  * Note that we cannot access ASI 0x67 (ASI_IC_TAG) with the Icache on,
828  * because accesses to ASI 0x67 interfere with Icache coherency.  We
829  * must make sure the Icache is off, then turn it back on after the entire
830  * cache has been invalidated.  If the Icache is originally off, we'll just
831  * clear the tags but not turn the Icache on.
832  *
833  * arg1 = icache size
834  * arg2 = icache linesize
835  */
836 #define	CH_ICACHE_FLUSHALL(arg1, arg2, tmp1, tmp2)			\
837 	ldxa	[%g0]ASI_DCU, tmp2;	/* tmp2 = original DCU value */	\
838 	andn	tmp2, DCU_IC, tmp1;	/* tmp1 = DCU with IC cleared */ \
839 	stxa	tmp1, [%g0]ASI_DCU;	/* write DCU with IC off */	\
840 	flush	%g0;	/* flush required after changing the IC bit */	\
841 	sllx	arg2, 1, arg2;		/* arg2 = linesize * 2 */	\
842 	sllx	arg1, 1, arg1;		/* arg1 = size * 2 */		\
843 	sub	arg1, arg2, arg1;	/* arg1 = highest offset */	\
844 	or	arg1, CH_ICTAG_LOWER, arg1;	/* "write" tag */	\
845 1:									\
846 	stxa	%g0, [arg1]ASI_IC_TAG;	/* store zero to tag at arg1 */	\
847 	membar	#Sync;				/* Cheetah PRM 8.9.3 */	\
848 	cmp	arg1, CH_ICTAG_LOWER;	/* was that offset 0? */	\
849 	bne,pt	%icc, 1b;		/* no: go do the next one */	\
850 	  sub	arg1, arg2, arg1;	/* delay slot: step down */	\
851 	stxa	tmp2, [%g0]ASI_DCU;	/* restore original DCU (arg1, arg2 are not restored) */ \
852 	flush	%g0;	/* flush required after changing the IC bit */
853 
854 
855 #if defined(JALAPENO) || defined(SERRANO)
856 
857 /*
858  * ASI access to the L2 tag or L2 flush can hang the cpu when interacting
859  * with combinations of L2 snoops, victims and stores.
860  *
861  * A possible workaround is to surround each L2 ASI access with membars
862  * and make sure that the code is hitting in the Icache.  This requires
863  * aligning code sequence at E$ boundary and forcing I$ fetch by
864  * jumping to selected offsets so that we don't take any I$ misses
865  * during ASI access to the L2 tag or L2 flush.  This also requires
866  * making sure that we don't take any interrupts or traps (such as
867  * fast ECC trap, I$/D$ tag parity error) which can result in eviction
868  * of this code sequence from I$, thus causing a miss.
869  *
870  * Because of the complexity/risk, we have decided to do a partial fix
871  * of adding membar around each ASI access to the L2 tag or L2 flush.
872  */
873 
874 #define	JP_EC_DIAG_ACCESS_MEMBAR	\
875 	membar	#Sync	/* placed around each L2 tag / L2 flush ASI access */
876 
877 /*
878  * Jalapeno version of macro that flushes the entire Ecache.
879  *
880  * Uses Jalapeno displacement flush feature of ASI_EC_DIAG.
881  *
882  * arg1 = ecache size
883  * arg2 = ecache linesize - not modified; can be an immediate constant.
884  */
885 #define	ECACHE_FLUSHALL(arg1, arg2, tmp1, tmp2)	\
886 	CPU_INDEX(tmp1, tmp2);	/* tmp1 = cpu index (macro defined elsewhere) */ \
887 	set	JP_ECACHE_IDX_DISP_FLUSH, tmp2;				\
888 	sllx	tmp1, JP_ECFLUSH_PORTID_SHIFT, tmp1;			\
889 	or	tmp1, tmp2, tmp1;	/* tmp1 = flush base address */	\
890 	srlx	arg1, JP_EC_TO_SET_SIZE_SHIFT, tmp2; /* tmp2 = inner loop start offset */ \
891 1:									\
892 	subcc	tmp2, arg2, tmp2;	/* step down one line */	\
893 	JP_EC_DIAG_ACCESS_MEMBAR;					\
894 	ldxa	[tmp1 + tmp2]ASI_EC_DIAG, %g0;	/* result discarded */	\
895 	JP_EC_DIAG_ACCESS_MEMBAR;					\
896 	bg,pt	%xcc, 1b;		/* loop while offset > 0 */	\
897 	  nop;				/* branch delay slot */		\
898 	mov	1, tmp2;						\
899 	sllx	tmp2, JP_ECFLUSH_EC_WAY_SHIFT, tmp2;			\
900 	add	tmp1, tmp2, tmp1;	/* advance way field in tmp1 */	\
901 	mov	(JP_ECACHE_NWAY-1), tmp2;				\
902 	sllx	tmp2, JP_ECFLUSH_EC_WAY_SHIFT, tmp2; /* tmp2 = way field mask */ \
903 	andcc	tmp1, tmp2, tmp2;	/* way bits still nonzero? */	\
904 	bnz,pt	%xcc, 1b;		/* yes: run inner loop again */	\
905 	  srlx	arg1, JP_EC_TO_SET_SIZE_SHIFT, tmp2	/* delay slot: reset offset */
906 
907 #else	/* JALAPENO || SERRANO */
908 
909 /*
910  * Cheetah version of macro that flushes the entire Ecache.
911  *
912  *  Need to displacement flush 2x ecache size from Ecache flush area.
913  *
914  * arg1 = ecache size
915  * arg2 = ecache linesize
916  * arg3 = ecache flush address - for cheetah only
917  */
918 #define	CH_ECACHE_FLUSHALL(arg1, arg2, arg3)				\
919 	sllx	arg1, 1, arg1;		/* arg1 = 2 * ecache size */	\
920 1:									\
921 	subcc	arg1, arg2, arg1;	/* step down one line */	\
922 	bg,pt	%xcc, 1b;		/* loop while offset > 0 */	\
923 	  ldxa	[arg1 + arg3]ASI_MEM, %g0;	/* delay slot: load, result discarded */
924 
925 /*
926  * Cheetah+ version of macro that flushes the entire Ecache.
927  *
928  * Uses the displacement flush feature.
929  *
930  * arg1 = ecache size
931  * arg2 = ecache linesize
932  * impl = CPU implementation as returned from GET_CPU_IMPL()
933  *        The value in this register is destroyed during execution
934  *        of the macro.
935  */
936 #if defined(CHEETAH_PLUS)
937 #define	CHP_ECACHE_FLUSHALL(arg1, arg2, impl)				\
938 	cmp	impl, PANTHER_IMPL;					\
939 	bne	%xcc, 1f;		/* not Panther */		\
940 	  nop;				/* branch delay slot */		\
941 	set	PN_L3_IDX_DISP_FLUSH, impl; /* impl reused as flush base */ \
942 	b	2f;							\
943 	  nop;				/* branch delay slot */		\
944 1:									\
945 	set	CHP_ECACHE_IDX_DISP_FLUSH, impl; /* impl reused as flush base */ \
946 2:									\
947 	subcc	arg1, arg2, arg1;	/* step down one line */	\
948 	bg,pt	%xcc, 2b;		/* loop while offset > 0 */	\
949 	  ldxa	[arg1 + impl]ASI_EC_DIAG, %g0;	/* delay slot: load, result discarded */
950 #else	/* CHEETAH_PLUS */
951 #define	CHP_ECACHE_FLUSHALL(arg1, arg2, impl)	/* expands to nothing */
952 #endif	/* CHEETAH_PLUS */
953 
954 /*
955  * Macro that flushes the entire Ecache.
956  *
957  * arg1 = ecache size
958  * arg2 = ecache linesize
959  * arg3 = ecache flush address - for cheetah only
960  */
961 #define	ECACHE_FLUSHALL(arg1, arg2, arg3, tmp1)				\
962 	GET_CPU_IMPL(tmp1);		/* tmp1 = CPU implementation */	\
963 	cmp	tmp1, CHEETAH_IMPL;					\
964 	bne	%xcc, 2f;		/* not Cheetah: use 2: path */	\
965 	  nop;				/* branch delay slot */		\
966 	CH_ECACHE_FLUSHALL(arg1, arg2, arg3);				\
967 	ba	3f;							\
968 	  nop;				/* branch delay slot */		\
969 2:									\
970 	CHP_ECACHE_FLUSHALL(arg1, arg2, tmp1);	/* destroys tmp1 */	\
971 3:
972 
973 #endif	/* JALAPENO || SERRANO */
974 
975 /*
976  * Macro that flushes the Panther L2 cache.
977  */
978 #if defined(CHEETAH_PLUS)
979 #define	PN_L2_FLUSHALL(scr1, scr2, scr3)				\
980 	GET_CPU_IMPL(scr3);		/* scr3 = CPU implementation */	\
981 	cmp	scr3, PANTHER_IMPL;					\
982 	bne	%xcc, 2f;		/* not Panther: do nothing */	\
983 	  nop;				/* branch delay slot */		\
984 	set	PN_L2_SIZE, scr1;	/* scr1 = running offset */	\
985 	set	PN_L2_LINESIZE, scr2;	/* scr2 = step */		\
986 	set	PN_L2_IDX_DISP_FLUSH, scr3; /* scr3 = flush base */	\
987 1:									\
988 	subcc	scr1, scr2, scr1;	/* step down one line */	\
989 	bg,pt	%xcc, 1b;		/* loop while offset > 0 */	\
990 	  ldxa	[scr1 + scr3]ASI_L2_TAG, %g0;	/* delay slot: load, result discarded */ \
991 2:
992 #else	/* CHEETAH_PLUS */
993 #define	PN_L2_FLUSHALL(scr1, scr2, scr3)	/* expands to nothing */
994 #endif	/* CHEETAH_PLUS */
995 
996 /*
997  * Given a VA and page size (page size as encoded in ASI_MMU_TAG_ACCESS_EXT),
998  * this macro returns the TLB index for that mapping based on a 512 entry
999  * (2-way set associative) TLB. Aside from the 16 entry fully associative
1000  * TLBs, all TLBs in Panther are 512 entry, 2-way set associative.
1001  *
1002  * To find the index, we shift the VA right by 13 + (3 * pg_sz) and then
1003  * mask out all but the lower 8 bits because:
1004  *
1005  *    ASI_[D|I]MMU_TAG_ACCESS_EXT.PgSz = 0 for   8K
1006  *    ASI_[D|I]MMU_TAG_ACCESS_EXT.PgSz = 1 for  64K
1007  *    ASI_[D|I]MMU_TAG_ACCESS_EXT.PgSz = 2 for 512K
1008  *    ASI_[D|I]MMU_TAG_ACCESS_EXT.PgSz = 3 for   4M
1009  *    ASI_[D|I]MMU_TAG_ACCESS_EXT.PgSz = 4 for  32M
1010  *    ASI_[D|I]MMU_TAG_ACCESS_EXT.PgSz = 5 for 256M
1011  *
1012  * and
1013  *
1014  *    array index for   8K pages = VA[20:13]
1015  *    array index for  64K pages = VA[23:16]
1016  *    array index for 512K pages = VA[26:19]
1017  *    array index for   4M pages = VA[29:22]
1018  *    array index for  32M pages = VA[32:25]
1019  *    array index for 256M pages = VA[35:28]
1020  *
1021  * Inputs:
1022  *
1023  *    va	- Register.
1024  *		  Input: Virtual address in which we are interested.
1025  *		  Output: TLB index value.
1026  *    pg_sz	- Register. Page Size of the TLB in question as encoded
1027  *		  in the ASI_[D|I]MMU_TAG_ACCESS_EXT register.
1028  */
1029 #if defined(CHEETAH_PLUS)
1030 #define	PN_GET_TLB_INDEX(va, pg_sz)					\
1031 	srlx	va, 13, va;	/* first shift the 13 bits and then */	\
1032 	srlx	va, pg_sz, va;	/* shift by pg_sz three times. */	\
1033 	srlx	va, pg_sz, va;	/* (pg_sz itself is not modified) */	\
1034 	srlx	va, pg_sz, va;	/* net shift: 13 + (3 * pg_sz) */	\
1035 	and	va, 0xff, va;	/* mask out all but the lower 8 bits */
1036 #endif	/* CHEETAH_PLUS */
1037 
1038 /*
1039  * The following macros are for error traps at TL>0.
1040  * The issue with error traps at TL>0 is that there are no safely
1041  * available global registers.  So we use the trick of generating a
1042  * software trap, then using the %tpc, %tnpc and %tstate registers to
1043  * temporarily save the values of %g1 and %g2.
1044  */
1045 
1046 /*
1047  * Macro to generate 8-instruction trap table entry for TL>0 trap handlers.
1048  * Does the following steps:
1049  *	1. membar #Sync - required for USIII family errors.
1050  *	2. Specified software trap.
1051  * NB: Must be 8 instructions or less to fit in trap table and code must
1052  *     be relocatable.
1053  */
1054 #define	CH_ERR_TL1_TRAPENTRY(trapno)		\
1055 	membar	#Sync;	/* required for USIII family errors */	\
1056 	ta	trapno;	/* take the specified software trap */	\
1057 	nop; nop; nop; nop; nop; nop	/* pad entry to 8 instructions */
1058 
1059 /*
1060  * Macro to generate 8-instruction trap table entry for TL>0 software trap.
1061  * We save the values of %g1 and %g2 in %tpc, %tnpc and %tstate (since
1062  * the low-order two bits of %tpc/%tnpc are reserved and read as zero,
1063  * we need to put the low-order two bits of %g1 and %g2 in %tstate).
1064  * Note that %tstate has a reserved hole from bits 3-7, so we put the
1065  * low-order two bits of %g1 in bits 0-1 and the low-order two bits of
1066  * %g2 in bits 10-11 (ensuring bits 8-9 are zero for use by the D$/I$
1067  * state bits).  Note that we must do a jmp instruction, since this
1068  * is moved into the trap table entry.
1069  * NB: Must be 8 instructions or less to fit in trap table and code must
1070  *     be relocatable.
1071  */
1072 #define	CH_ERR_TL1_SWTRAPENTRY(label)		\
1073 	wrpr	%g1, %tpc;	/* stash %g1 in %tpc */	\
1074 	and	%g1, 3, %g1;	/* keep low 2 bits of %g1 */	\
1075 	wrpr	%g2, %tnpc;	/* stash %g2 in %tnpc */	\
1076 	sllx	%g2, CH_ERR_G2_TO_TSTATE_SHFT, %g2; \
1077 	or	%g1, %g2, %g2;	/* combine both for %tstate */	\
1078 	sethi	%hi(label), %g1;		\
1079 	jmp	%g1+%lo(label);	/* absolute jump: code is copied into trap table */ \
1080 	  wrpr	%g2, %tstate	/* delay slot */
1081 
1082 /*
1083  * Macro to get ptr to ch_err_tl1_data.
1084  * reg1 will either point to a physaddr with ASI_MEM in %asi OR it
1085  * will point to a kernel nucleus virtual address with ASI_N in %asi.
1086  * This allows us to:
1087  *   1. Avoid getting MMU misses.  We may have gotten the original
1088  *	Fast ECC error in an MMU handler and if we get an MMU trap
1089  *	in the TL>0 handlers, we'll scribble on the MMU regs.
1090  *   2. Allows us to use the same code in the TL>0 handlers whether
1091  *	we're accessing kernel nucleus virtual addresses or physical
1092  *	addresses.
1093  * pseudo-code:
1094  *	reg1 <- ch_err_tl1_paddrs[CPUID];
1095  *	if (reg1 == NULL) {
1096  *		reg1 <- &ch_err_tl1_data
1097  *		%asi <- ASI_N
1098  *	} else {
1099  *		reg1 <- reg1 + offset +
1100  *		    sizeof (ch_err_tl1_data) * (%tl - 3)
1101  *		%asi <- ASI_MEM
1102  *	}
1103  */
1104 #define	GET_CH_ERR_TL1_PTR(reg1, reg2, offset)	\
1105 	CPU_INDEX(reg1, reg2);	/* reg1 = CPUID (macro defined elsewhere) */ \
1106 	sllx	reg1, 3, reg1;	/* index * 8 */	\
1107 	set	ch_err_tl1_paddrs, reg2;	\
1108 	ldx	[reg1+reg2], reg1;	/* reg1 = ch_err_tl1_paddrs[CPUID] */ \
1109 	brnz	reg1, 1f;	/* non-NULL: physical address path */ \
1110 	add	reg1, offset, reg1;	/* delay slot: runs on both paths */ \
1111 	set	ch_err_tl1_data, reg1;	/* NULL: overwrite with fallback */ \
1112 	ba	2f;				\
1113 	wr	%g0, ASI_N, %asi;	/* delay slot */	\
1114 1:	rdpr	%tl, reg2;			\
1115 	sub	reg2, 3, reg2;	/* reg2 = %tl - 3 */	\
1116 	mulx	reg2, CH_ERR_TL1_DATA_SIZE, reg2;	\
1117 	add	reg1, reg2, reg1;	/* select the per-%tl entry */ \
1118 	wr	%g0, ASI_MEM, %asi;		\
1119 2:
1120 
1121 /*
1122  * Macro to generate entry code for TL>0 error handlers.
1123  * At the end of this macro, %g1 will point to the ch_err_tl1_data
1124  * structure and %g2 will have the original flags in the ch_err_tl1_data
1125  * structure and %g5 will have the value of %tstate where the Fast ECC
1126  * routines will save the state of the D$ in Bit2 CH_ERR_TSTATE_DC_ON.
1127  * All %g registers except for %g1, %g2 and %g5 will be available after
1128  * this macro.
1129  * Does the following steps:
1130  *   1. Compute physical address of per-cpu/per-tl save area using
1131  *	only %g1+%g2 (which we've saved in %tpc, %tnpc, %tstate)
1132  *	leaving address in %g1 and updating the %asi register.
1133  *	If none is set up, the ch_err_tl1_data virtual address is used.
1134  *   2. Save %g3-%g7 in save area.
1135  *   3. Save %tpc->%g3, %tnpc->%g4, %tstate->%g5, which contain
1136  *	original %g1+%g2 values (because we're going to change %tl).
1137  *   4. set %tl <- %tl - 1.  We do this ASAP to make window of
1138  *	running at %tl+1 as small as possible.
1139  *   5. Reconstitute %g1+%g2 from %tpc (%g3), %tnpc (%g4),
1140  *	%tstate (%g5) and save in save area, carefully preserving %g5
1141  *	because it has the CH_ERR_TSTATE_DC_ON value.
1142  *   6. Load existing ch_err_tl1_data flags in %g2
1143  *   7. Compute the new flags
1144  *   8. If %g2 is non-zero (the structure was busy), shift the new
1145  *	flags by CH_ERR_ME_SHIFT and or them with the old flags.
1146  *   9. Store the updated flags into ch_err_tl1_data flags.
1147  *   10. If %g2 is zero (the structure was not busy), read the %tpc and
1148  *	store it in ch_err_tl1_data.
1149  */
1150 #define	CH_ERR_TL1_ENTER(flags)			\
1151 	GET_CH_ERR_TL1_PTR(%g1, %g2, CHPR_TL1_ERR_DATA);	\
1152 	stxa	%g3, [%g1 + CH_ERR_TL1_G3]%asi;	/* save %g3-%g7 */ \
1153 	stxa	%g4, [%g1 + CH_ERR_TL1_G4]%asi;	\
1154 	stxa	%g5, [%g1 + CH_ERR_TL1_G5]%asi;	\
1155 	stxa	%g6, [%g1 + CH_ERR_TL1_G6]%asi;	\
1156 	stxa	%g7, [%g1 + CH_ERR_TL1_G7]%asi;	\
1157 	rdpr	%tpc, %g3;	/* holds original %g1 */	\
1158 	rdpr	%tnpc, %g4;	/* holds original %g2 */	\
1159 	rdpr	%tstate, %g5;	/* holds the low-order bits */	\
1160 	rdpr	%tl, %g6;			\
1161 	sub	%g6, 1, %g6;			\
1162 	wrpr	%g6, %tl;	/* %tl <- %tl - 1 */	\
1163 	and	%g5, 3, %g6;	/* low 2 bits of original %g1 */ \
1164 	andn	%g3, 3, %g3;			\
1165 	or	%g3, %g6, %g3;	/* %g3 = original %g1 */	\
1166 	stxa	%g3, [%g1 + CH_ERR_TL1_G1]%asi;	\
1167 	srlx	%g5, CH_ERR_G2_TO_TSTATE_SHFT, %g6;	\
1168 	and	%g6, 3, %g6;	/* low 2 bits of original %g2 */ \
1169 	andn	%g4, 3, %g4;			\
1170 	or	%g6, %g4, %g4;	/* %g4 = original %g2 */	\
1171 	stxa	%g4, [%g1 + CH_ERR_TL1_G2]%asi;	\
1172 	ldxa	[%g1 + CH_ERR_TL1_FLAGS]%asi, %g2;	\
1173 	set	flags | CH_ERR_TL, %g3;	/* new flags */	\
1174 	brz	%g2, 9f;	/* old flags zero: store new as is */ \
1175 	sllx	%g3, CH_ERR_ME_SHIFT, %g4;	/* delay slot */ \
1176 	or	%g2, %g4, %g3;	/* old flags | shifted new flags */ \
1177 9:	stxa	%g3, [%g1 + CH_ERR_TL1_FLAGS]%asi;	\
1178 	brnz	%g2, 8f;	/* old flags nonzero: skip %tpc store */ \
1179 	rdpr	%tpc, %g4;	/* delay slot */	\
1180 	stxa	%g4, [%g1 + CH_ERR_TL1_TPC]%asi;	\
1181 8:
1182 
1183 /*
1184  * Turns off D$/I$ and saves the state of DCU_DC+DCU_IC in %tstate Bits 8+9
1185  * (CH_ERR_TSTATE_DC_ON/CH_ERR_TSTATE_IC_ON).  This is invoked on Fast ECC
1186  * at TL>0 handlers because the D$ may have corrupted data and we need to
1187  * turn off the I$ to allow for diagnostic accesses.  We then invoke
1188  * the normal entry macro and after it is done we save the values of
1189  * the original D$/I$ state, which is in %g5 bits CH_ERR_TSTATE_DC_ON/
1190  * CH_ERR_TSTATE_IC_ON in ch_err_tl1_tmp.
1191  */
1192 #define	CH_ERR_TL1_FECC_ENTER			\
1193 	ldxa	[%g0]ASI_DCU, %g1;	/* %g1 = current DCU value */ \
1194 	andn	%g1, DCU_DC + DCU_IC, %g2;	\
1195 	stxa	%g2, [%g0]ASI_DCU;	/* DCU_DC and DCU_IC cleared */ \
1196 	flush	%g0;	/* DCU_IC need flush */	\
1197 	rdpr	%tstate, %g2;			\
1198 	and	%g1, DCU_DC + DCU_IC, %g1;	/* original DC/IC bits */ \
1199 	sllx	%g1, CH_ERR_DCU_TO_TSTATE_SHFT, %g1;	\
1200 	or	%g1, %g2, %g2;			\
1201 	wrpr	%g2, %tstate;	/* record them in %tstate */	\
1202 	CH_ERR_TL1_ENTER(CH_ERR_FECC);		\
1203 	and	%g5, CH_ERR_TSTATE_DC_ON + CH_ERR_TSTATE_IC_ON, %g5;	\
1204 	stxa	%g5, [%g1 + CH_ERR_TL1_TMP]%asi	/* save in ch_err_tl1_tmp */
1205 
1206 /*
1207  * Macro to generate exit code for TL>0 error handlers.
1208  * We fall into this macro if we've successfully logged the error in
1209  * the ch_err_tl1_data structure and want the PIL15 softint to pick
1210  * it up and log it.
1211  * Does the following steps:
1212  *   1.	Set pending flag for this cpu in ch_err_tl1_pending.
1213  *   2.	Write %set_softint with (1<<pil) to cause a pil level trap
1214  *   3.	Restore registers from ch_err_tl1_data, which is pointed to
1215  *	by %g1, last register to restore is %g1 since it's pointing
1216  *	to the save area.
1217  *   4. Execute retry
1218  */
1219 #define	CH_ERR_TL1_EXIT				\
1220 	CPU_INDEX(%g2, %g3);	/* %g2 = cpu index (macro defined elsewhere) */ \
1221 	set	ch_err_tl1_pending, %g3;	\
1222 	set	-1, %g4;			\
1223 	stb	%g4, [%g2 + %g3];	/* pending byte for this cpu = 0xff */ \
1224 	mov	1, %g2;				\
1225 	sll	%g2, PIL_15, %g2;	/* %g2 = 1 << PIL_15 */	\
1226 	wr	%g2, SET_SOFTINT;		\
1227 	ldxa	[%g1 + CH_ERR_TL1_G7]%asi, %g7;	/* restore %g7..%g2 */ \
1228 	ldxa	[%g1 + CH_ERR_TL1_G6]%asi, %g6;	\
1229 	ldxa	[%g1 + CH_ERR_TL1_G5]%asi, %g5;	\
1230 	ldxa	[%g1 + CH_ERR_TL1_G4]%asi, %g4;	\
1231 	ldxa	[%g1 + CH_ERR_TL1_G3]%asi, %g3;	\
1232 	ldxa	[%g1 + CH_ERR_TL1_G2]%asi, %g2;	\
1233 	ldxa	[%g1 + CH_ERR_TL1_G1]%asi, %g1;	/* last: %g1 is the base ptr */ \
1234 	retry
1235 
1236 /*
1237  * Generates unrecoverable error label for TL>0 handlers.
1238  * At label (Unrecoverable error routine)
1239  *   1. Sets flags in ch_err_tl1_data and leaves in %g2 (first
1240  *	argument to cpu_tl1_err_panic).
1241  *   2.	Call cpu_tl1_err_panic via systrap at PIL 15
1242  */
1243 #define	CH_ERR_TL1_PANIC_EXIT(label)		\
1244 label:	ldxa	[%g1 + CH_ERR_TL1_FLAGS]%asi, %g2;	\
1245 	or	%g2, CH_ERR_TL | CH_ERR_PANIC, %g2;	\
1246 	stxa	%g2, [%g1 + CH_ERR_TL1_FLAGS]%asi;	/* updated flags stay in %g2 */ \
1247 	set	cpu_tl1_err_panic, %g1;	/* %g1 no longer points at save area */ \
1248 	ba	sys_trap;			\
1249 	  mov	PIL_15, %g4	/* delay slot: %g4 = PIL_15 */
1250 
1251 
1252 
1253 /* END CSTYLED */
1254 #endif	/* _ASM */
1255 
1256 #ifdef	__cplusplus
1257 }
1258 #endif
1259 
1260 #endif /* _CHEETAHASM_H */
1261