xref: /titanic_50/usr/src/cmd/fm/modules/sun4u/cpumem-diagnosis/cmd_Lxcacheerr.c (revision a62774df315360f02521d6470eab7d5080137dad)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 
27 /*
28  * Support routines for managing per-Lxcache state.
29  */
30 
31 #include <sys/types.h>
32 #include <errno.h>
33 #include <strings.h>
34 #include <sys/stat.h>
35 #include <fcntl.h>
36 #include <unistd.h>
37 #include <stropts.h>
38 #include <fm/fmd_api.h>
39 #include <sys/fm/protocol.h>
40 #include <sys/fm/cpu/UltraSPARC-III.h>
41 #include <sys/cpuvar.h>
42 #include <cmd_Lxcache.h>
43 #include <cmd_mem.h>
44 #include <cmd_cpu.h>
45 #include <cmd_state.h>
46 #include <cmd.h>
47 #define	_KERNEL
48 #include <sys/cheetahregs.h>
49 #include <sys/mem_cache.h>
50 #undef _KERNEL
51 #include <sys/errclassify.h>
52 #include <sys/fm/io/sun4upci.h>
53 
54 #include <fmd_adm.h>
55 #include <fmd_adm_impl.h>
56 #include <fmd_rpc_adm.h>
57 
58 #define	PN_CACHE_ERRORS (CMD_ERRCL_UCC | CMD_ERRCL_WDC | \
59 			    CMD_ERRCL_CPC | CMD_ERRCL_EDC | \
60 			    CMD_ERRCL_L3_UCC | CMD_ERRCL_L3_CPC |\
61 			    CMD_ERRCL_L3_WDC | CMD_ERRCL_L3_EDC)
62 
63 /* Note that these are the same for panther L2 and L3 (see prm) */
64 
65 #define	LX_INDEX_MASK		PN_L2_INDEX_MASK
66 #define	LX_INDEX_SHIFT		6
67 #define	PN_ECSTATE_NA	5
68 #define	PN_ECSTATE_INV	0
69 
70 #define	PN_L3_INDEX_MASK	PN_L3_TAG_RD_MASK
71 
72 static const errdata_t l3errdata =
73 	{ &cmd.cmd_l3data_serd, "l3cachedata", CMD_PTR_LxCACHE_CASE };
74 static const errdata_t l2errdata =
75 	{ &cmd.cmd_l2data_serd, "l2cachedata", CMD_PTR_LxCACHE_CASE };
76 
77 /* Macro for putting 64-bit onto stack as two 32-bit ints */
78 #define	PRTF_64_TO_32(x)	(uint32_t)((x)>>32), (uint32_t)(x)
79 
80 #define	LX_PA_MASK2_32BIT_CORRECT	16
81 #define	LX_PA_MASK3_32BIT_CORRECT	24
82 #define	LX_PA_MASK2 0x7fffff8
83 #define	LX_PA_MASK3 0x7ffff8
84 
85 
86 #define	MAX_RETRIES_FOR_ECC_MATCH	3
87 #define	PN_TAG_ECC_MASK 0x7fc0
88 #define	PN_L2_PTAG_SHIFT	19
89 #define	PN_L3_PTAG_SHIFT	24
90 #define	L2_PTAG_MASK		0xffffff
91 #define	L3_PTAG_MASK		0xfffff
92 #define	BIT_MASK		0x7f
93 #define	MSB_BIT			0x8000
94 #define	SET_MSB_BIT		0x8000
95 #define	CLEAR_MSB_BIT		0x7fff
96 #define	PN_LX_TAG_ECC_START_BIT	6
97 #define	PN_LX_TAG_ECC_END_BIT	14
98 #define	PN_LX_STATE_END_BIT	2
99 #define	PN_LX_NUM_OF_BITS_IN_ECC	9
100 
101 #define	LX_NWAYS		4
102 
/* Test hook: nonzero forces tag data to be re-read via the mem_cache driver. */
int test_mode = 0;	/* should be 0 in production version. */
/* nvlist member flagging an ereport generated by our own tag recheck. */
#define	FM_EREPORT_RECHECK_OF_TAGS "recheck_tags"
#define	RETRIES_TO_BE_DONE_WHEN_SYND_IS_ZERO	3
/*
 * Back-off delays applied before each successive tag recheck when the
 * computed syndrome is zero (units per the timer users of this table —
 * NOTE(review): presumably seconds; confirm against the scheduling caller).
 */
uint32_t cmd_Lxcache_recheck_tags_delay
	[RETRIES_TO_BE_DONE_WHEN_SYND_IS_ZERO + 1] = {0, 1, 2, 4};
108 
/*
 * e (for ecctable) maps single bit positions (0-127, or 0-0x7F) to the
 * corresponding ECC syndromes for an error in that position.
 * Entries 0-127 cover the data bits; the final nine entries are the
 * syndromes for the check bits C0-C8 themselves (a single-bit error in
 * check bit Cn yields syndrome 1 << n).
 */
int e[] = {
	/* From Table P-4, JPS1 US-III Supplement */
		/* 0	1	2	3	4	5	6	7 */
/* 00 */	0x03B,	0x127,	0x067,	0x097,	0x10F,	0x08F,	0x04F,	0x02C,
/* 08 */	0x147,	0x0C7,	0x02F,	0x01C,	0x117,	0x032,	0x08A,	0x04A,
/* 10 */	0x01F,	0x086,	0x046,	0x026,	0x09B,	0x08C,	0x0C1,	0x0A1,
/* 18 */	0x01A,	0x016,	0x061,	0x091,	0x052,	0x00E,	0x109,	0x029,
/* 20 */	0x02A,	0x019,	0x105,	0x085,	0x045,	0x025,	0x015,	0x103,
/* 28 */	0x031,	0x00D,	0x083,	0x043,	0x051,	0x089,	0x023,	0x007,
/* 30 */	0x0B9,	0x049,	0x013,	0x0A7,	0x057,	0x00B,	0x07A,	0x187,
/* 38 */	0x0F8,	0x11B,	0x079,	0x034,	0x178,	0x1D8,	0x05B,	0x04C,
/* 40 */	0x064,	0x1B4,	0x037,	0x03D,	0x058,	0x13C,	0x1B1,	0x03E,
/* 48 */	0x1C3,	0x0BC,	0x1A0,	0x1D4,	0x1CA,	0x190,	0x124,	0x13A,
/* 50 */	0x1C0,	0x188,	0x122,	0x114,	0x184,	0x182,	0x160,	0x118,
/* 58 */	0x181,	0x150,	0x148,	0x144,	0x142,	0x141,	0x130,	0x0A8,
/* 60 */	0x128,	0x121,	0x0E0,	0x094,	0x112,	0x10C,	0x0D0,	0x0B0,
/* 68 */	0x10A,	0x106,	0x062,	0x1B2,	0x0C8,	0x0C4,	0x0C2,	0x1F0,
/* 70 */	0x0A4,	0x0A2,	0x098,	0x1D1,	0x070,	0x1E8,	0x1C6,	0x1C5,
/* 78 */	0x068,	0x1E4,	0x1E2,	0x1E1,	0x1D2,	0x1CC,	0x1C9,	0x1B8,
	/* Now we have the check bits */
	/* C0	C1	C2	C3	C4	C5	C6	C7	C8 */
	0x001,	0x002,	0x004,	0x008,	0x010,	0x020,	0x040,	0x080,	0x100,
};
136 
137 #define	NBITS (sizeof (e)/sizeof (e[0]))
138 #define	NDATABITS (128)
139 /*
140  * This table is used to determine which bit(s) is(are) bad when an ECC
141  * error occurs.  The array is indexed by an 9-bit syndrome.  The entries
142  * of this array have the following semantics:
143  *
144  *      00-127  The number of the bad bit, when only one bit is bad.
145  *      128     ECC bit C0 is bad.
146  *      129     ECC bit C1 is bad.
147  *      130     ECC bit C2 is bad.
148  *      131     ECC bit C3 is bad.
149  *      132     ECC bit C4 is bad.
150  *      133     ECC bit C5 is bad.
151  *      134     ECC bit C6 is bad.
152  *      135     ECC bit C7 is bad.
153  *      136     ECC bit C8 is bad.
154  *	137-143 reserved for Mtag Data and ECC.
155  *      144(M2) Two bits are bad within a nibble.
156  *      145(M3) Three bits are bad within a nibble.
157  *      146(M3) Four bits are bad within a nibble.
158  *      147(M)  Multiple bits (5 or more) are bad.
159  *      148     NO bits are bad.
160  * Based on "Cheetah Programmer's Reference Manual" rev 1.1, Tables 11-4,11-5.
161  */
162 
163 #define	C0	128
164 #define	C1	129
165 #define	C2	130
166 #define	C3	131
167 #define	C4	132
168 #define	C5	133
169 #define	C6	134
170 #define	C7	135
171 #define	C8	136
172 #define	MT0	137	/* Mtag Data bit 0 */
173 #define	MT1	138
174 #define	MT2	139
175 #define	MTC0	140	/* Mtag Check bit 0 */
176 #define	MTC1	141
177 #define	MTC2	142
178 #define	MTC3	143
179 #define	M2	144
180 #define	M3	145
181 #define	M4	146
182 #define	M	147
183 #define	NA	148
184 #if defined(JALAPENO) || defined(SERRANO)
185 #define	S003	149	/* Syndrome 0x003 => likely from CPU/EDU:ST/FRU/BP */
186 #define	S003MEM	150	/* Syndrome 0x003 => likely from WDU/WBP */
187 #define	SLAST	S003MEM	/* last special syndrome */
188 #else /* JALAPENO || SERRANO */
189 #define	S003	149	/* Syndrome 0x003 => likely from EDU:ST */
190 #define	S071	150	/* Syndrome 0x071 => likely from WDU/CPU */
191 #define	S11C	151	/* Syndrome 0x11c => likely from BERR/DBERR */
192 #define	SLAST	S11C	/* last special syndrome */
193 #endif /* JALAPENO || SERRANO */
194 #if defined(JALAPENO) || defined(SERRANO)
195 #define	BPAR0	152	/* syndrom 152 through 167 for bus parity */
196 #define	BPAR15	167
197 #endif	/* JALAPENO || SERRANO */
198 
/*
 * Reverse mapping of the e[] table: indexed by a 9-bit syndrome, yields
 * either the failing bit number (see the block comment above for the
 * encoding of C0-C8, M2/M3/M4/M and NA) or a special-syndrome marker.
 * The #if sections select the platform-specific interpretations of
 * syndromes 0x071 and 0x11c.
 */
static uint8_t ecc_syndrome_tab[] =
{
NA,  C0,  C1, S003, C2,  M2,  M3,  47,  C3,  M2,  M2,  53,  M2,  41,  29,   M,
C4,   M,   M,  50,  M2,  38,  25,  M2,  M2,  33,  24,  M2,  11,   M,  M2,  16,
C5,   M,   M,  46,  M2,  37,  19,  M2,   M,  31,  32,   M,   7,  M2,  M2,  10,
M2,  40,  13,  M2,  59,   M,  M2,  66,   M,  M2,  M2,   0,  M2,  67,  71,   M,
C6,   M,   M,  43,   M,  36,  18,   M,  M2,  49,  15,   M,  63,  M2,  M2,   6,
M2,  44,  28,  M2,   M,  M2,  M2,  52,  68,  M2,  M2,  62,  M2,  M3,  M3,  M4,
M2,  26, 106,  M2,  64,   M,  M2,   2, 120,   M,  M2,  M3,   M,  M3,  M3,  M4,
#if defined(JALAPENO) || defined(SERRANO)
116, M2,  M2,  M3,  M2,  M3,   M,  M4,  M2,  58,  54,  M2,   M,  M4,  M4,  M3,
#else	/* JALAPENO || SERRANO */
116, S071, M2,  M3,  M2,  M3,   M,  M4,  M2,  58,  54,  M2,   M,  M4,  M4,  M3,
#endif	/* JALAPENO || SERRANO */
C7,  M2,   M,  42,   M,  35,  17,  M2,   M,  45,  14,  M2,  21,  M2,  M2,   5,
M,   27,   M,   M,  99,   M,   M,   3, 114,  M2,  M2,  20,  M2,  M3,  M3,   M,
M2,  23, 113,  M2, 112,  M2,   M,  51,  95,   M,  M2,  M3,  M2,  M3,  M3,  M2,
103,  M,  M2,  M3,  M2,  M3,  M3,  M4,  M2,  48,   M,   M,  73,  M2,   M,  M3,
M2,  22, 110,  M2, 109,  M2,   M,   9, 108,  M2,   M,  M3,  M2,  M3,  M3,   M,
102, M2,   M,   M,  M2,  M3,  M3,   M,  M2,  M3,  M3,  M2,   M,  M4,   M,  M3,
98,   M,  M2,  M3,  M2,   M,  M3,  M4,  M2,  M3,  M3,  M4,  M3,   M,   M,   M,
M2,  M3,  M3,   M,  M3,   M,   M,   M,  56,  M4,   M,  M3,  M4,   M,   M,   M,
C8,   M,  M2,  39,   M,  34, 105,  M2,   M,  30, 104,   M, 101,   M,   M,   4,
#if defined(JALAPENO) || defined(SERRANO)
M,    M, 100,   M,  83,   M,  M2,  12,  87,   M,   M,  57,  M2,   M,  M3,   M,
#else	/* JALAPENO || SERRANO */
M,    M, 100,   M,  83,   M,  M2,  12,  87,   M,   M,  57, S11C,  M,  M3,   M,
#endif	/* JALAPENO || SERRANO */
M2,  97,  82,  M2,  78,  M2,  M2,   1,  96,   M,   M,   M,   M,   M,  M3,  M2,
94,   M,  M2,  M3,  M2,   M,  M3,   M,  M2,   M,  79,   M,  69,   M,  M4,   M,
M2,  93,  92,   M,  91,   M,  M2,   8,  90,  M2,  M2,   M,   M,   M,   M,  M4,
89,   M,   M,  M3,  M2,  M3,  M3,   M,   M,   M,  M3,  M2,  M3,  M2,   M,  M3,
86,   M,  M2,  M3,  M2,   M,  M3,   M,  M2,   M,  M3,   M,  M3,   M,   M,  M3,
M,    M,  M3,  M2,  M3,  M2,  M4,   M,  60,   M,  M2,  M3,  M4,   M,   M,  M2,
M2,  88,  85,  M2,  84,   M,  M2,  55,  81,  M2,  M2,  M3,  M2,  M3,  M3,  M4,
77,   M,   M,   M,  M2,  M3,   M,   M,  M2,  M3,  M3,  M4,  M3,  M2,   M,   M,
74,   M,  M2,  M3,   M,   M,  M3,   M,   M,   M,  M3,   M,  M3,   M,  M4,  M3,
M2,  70, 107,  M4,  65,  M2,  M2,   M, 127,   M,   M,   M,  M2,  M3,  M3,   M,
80,  M2,  M2,  72,   M, 119, 118,   M,  M2, 126,  76,   M, 125,   M,  M4,  M3,
M2, 115, 124,   M,  75,   M,   M,  M3,  61,   M,  M4,   M,  M4,   M,   M,   M,
M,  123, 122,  M4, 121,  M4,   M,  M3, 117,  M2,  M2,  M3,  M4,  M3,   M,   M,
111,  M,   M,   M,  M4,  M3,  M3,   M,   M,   M,  M3,   M,  M3,  M2,   M,   M
};

/* Number of entries in the syndrome decode table (one per 9-bit syndrome). */
#define	ESYND_TBL_SIZE	(sizeof (ecc_syndrome_tab) / sizeof (uint8_t))
244 
/*
 * The following four tables decode a reported L2/L3 tag bit number
 * (0-127, see BIT_MASK) into (a) the cache way that owns the bit
 * (-1 where the bit does not belong to any way) and (b) the bit's
 * position within that way's tag (C0 marks an ECC check bit).
 * Layouts are Panther-specific; the L2 and L3 tag arrays swizzle
 * their ways differently.
 */
int8_t L2TAG_bit_to_way_map[128] = {
/*	1   2   3   4   5   6   7   8   9   10  11  12  13  14  15  16 */
/* 1 */ 0,  0,  0,  1,  1,  1,  2,  2,  2,  3,  3,  3,  0,  0,  0,  0,
/* 2 */ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/* 3 */ 0,  0,  0,  0,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
/* 4 */ 2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2, -1, -1, -1, -1,
/* 5 */-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  1,  1,  1,  1,
/* 6 */ 1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
/* 7 */ 1,  1,  1,  1,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* 8 */ 3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3, -1, -1, -1, -1,
};

uint8_t L2TAG_bit_to_way_bit[128] = {
/*	1   2   3   4   5   6   7   8   9   10  11  12  13  14  15  16 */
/* 1 */ 0,  1,  2,  0,  1,  2,  0,  1,  2,  0,  1,  2,  19, 20, 21, 22,
/* 2 */23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38,
/* 3 */39, 40, 41, 42, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30,
/* 4 */31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, C0, C0, C0, C0,
/* 5 */C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, 19, 20, 21, 22,
/* 6 */23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38,
/* 7 */39, 40, 41, 42, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30,
/* 8 */31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, C0, C0, C0, C0,
};

int8_t L3TAG_bit_to_way_map[128] = {
/*	1   2   3   4   5   6   7   8   9   10  11  12  13  14  15  16 */
/* 1 */ 1,  3,  1,  3,  1,  3,  1,  3,  1,  3,  1,  3,  1,  3,  1,  3,
/* 2 */ 1,  3,  1,  3,  1,  3,  1,  3,  1,  3,  1,  3,  1,  3,  1,  3,
/* 3 */ 1,  3,  1,  3,  1,  3,  1,  3,  1,  3,  1,  3,  1,  3, -1, -1,
/* 4 */-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
/* 5 */ 0,  2,  0,  2,  0,  2,  0,  2,  0,  2,  0,  2,  0,  2,  0,  2,
/* 6 */ 0,  2,  0,  2,  0,  2,  0,  2,  0,  2,  0,  2,  0,  2,  0,  2,
/* 7 */ 0,  2,  0,  2,  0,  2,  0,  2,  0,  2,  0,  2,  0,  2, -1, -1,
/* 8 */-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
};

uint8_t L3TAG_bit_to_way_bit[128] = {
/*	1   2   3   4   5   6   7   8   9   10  11  12  13  14  15  16 */
/* 1 */ 0,  0,  1,  1,  2,  2, 24, 24, 25, 25, 26, 26, 27, 27, 28, 28,
/* 2 */29, 29, 30, 30, 31, 31, 32, 32, 33, 33, 34, 34, 35, 35, 36, 36,
/* 3 */37, 37, 38, 38, 39, 39, 40, 40, 41, 41, 42, 42, 43, 43, C0, C0,
/* 4 */C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0,
/* 5 */ 0,  0,  1,  1,  2,  2, 24, 24, 25, 25, 26, 26, 27, 27, 28, 28,
/* 6 */29, 29, 30, 30, 31, 31, 32, 32, 33, 33, 34, 34, 35, 35, 36, 36,
/* 7 */37, 37, 38, 38, 39, 39, 40, 40, 41, 41, 42, 42, 43, 43, C0, C0,
/* 8 */C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0,
};
292 
293 uint16_t
calcecc(uint64_t chi,uint64_t clo)294 calcecc(uint64_t chi, uint64_t clo)
295 {
296 	int i;
297 	uint64_t syndrome = 0;
298 
299 	for (i = 0; i < (NDATABITS/2); i++) {
300 		syndrome ^= ((chi & 1) ? e[(NDATABITS/2) + i] : 0) ^
301 		    ((clo & 1) ? e[i] : 0);
302 		chi >>= 1;
303 		clo >>= 1;
304 	}
305 	return (uint16_t)(syndrome);
306 }
307 
/*
 * Syndrome is the XOR of the freshly computed ECC and the ECC that was
 * stored with the data; a zero result means no error was detected.
 */
uint64_t
calcsynd(uint64_t chi, uint64_t clo, uint64_t ecc)
{
	uint64_t computed = calcecc(chi, clo);

	return (computed ^ ecc);
}
313 
314 static uint8_t
tag_bit_to_way_bit(cmd_ptrsubtype_t pstype,int16_t tag_bit)315 tag_bit_to_way_bit(cmd_ptrsubtype_t pstype, int16_t tag_bit)
316 {
317 	uint8_t way_bit = C0;
318 
319 	switch (pstype) {
320 		case CMD_PTR_CPU_L2TAG:
321 			way_bit = L2TAG_bit_to_way_bit[tag_bit];
322 			break;
323 		case CMD_PTR_CPU_L3TAG:
324 			way_bit = L3TAG_bit_to_way_bit[tag_bit];
325 			break;
326 	}
327 	return (way_bit);
328 }
329 
330 static int8_t
bit_to_way(cmd_ptrsubtype_t pstype,uint32_t bit)331 bit_to_way(cmd_ptrsubtype_t pstype, uint32_t bit)
332 {
333 	int8_t way = -1;
334 
335 	switch (pstype) {
336 		case CMD_PTR_CPU_L2TAG:
337 			way = L2TAG_bit_to_way_map[bit & BIT_MASK];
338 			break;
339 		case CMD_PTR_CPU_L3TAG:
340 			way = L3TAG_bit_to_way_map[bit & BIT_MASK];
341 			break;
342 	}
343 	return (way);
344 }
345 
346 static int32_t
get_index(cmd_ptrsubtype_t pstype,uint64_t tag_afar)347 get_index(cmd_ptrsubtype_t pstype, uint64_t tag_afar)
348 {
349 	int32_t	index = -1;
350 
351 	switch (pstype) {
352 		case CMD_PTR_CPU_L2TAG:
353 			index = (int32_t)((tag_afar & PN_L2_INDEX_MASK)
354 			    >> PN_CACHE_LINE_SHIFT);
355 			break;
356 		case CMD_PTR_CPU_L3TAG:
357 			index = (int32_t)((tag_afar & PN_L3_TAG_RD_MASK)
358 			    >> PN_CACHE_LINE_SHIFT);
359 			break;
360 	}
361 	return (index);
362 }
363 
364 static int
get_retired_ways(uint64_t * tag_data)365 get_retired_ways(uint64_t *tag_data)
366 {
367 	int		i, retired_ways;
368 
369 	retired_ways = 0;
370 	for (i = 0; i < PN_CACHE_NWAYS; i++) {
371 		if ((tag_data[i] & CH_ECSTATE_MASK) ==
372 		    PN_ECSTATE_NA)
373 			retired_ways++;
374 	}
375 	return (retired_ways);
376 }
377 
378 static cmd_evdisp_t
extract_data_from_ereport_payload(fmd_hdl_t * hdl,nvlist_t * nvl,cmd_cpu_t * cpu,cmd_ptrsubtype_t pstype,uint64_t * afarp,uint64_t * tag_data,const char * fltnm)379 extract_data_from_ereport_payload(fmd_hdl_t *hdl, nvlist_t *nvl,
380 				    cmd_cpu_t *cpu,
381 				    cmd_ptrsubtype_t pstype,
382 				    uint64_t *afarp, uint64_t *tag_data,
383 				    const char *fltnm)
384 {
385 	ch_ec_data_t	*ec_data;
386 	char		*payload_namep;
387 	int		tag_afar_status;
388 	uint64_t	tag_afar;
389 	int		i;
390 	uint_t		sz;
391 	int32_t	index;
392 	int32_t		recheck_of_tags;
393 
394 	tag_afar_status = cmd_afar_valid(hdl, nvl, 0, &tag_afar);
395 	if (tag_afar_status == -1) {
396 		fmd_hdl_debug(hdl,
397 		    "\n%s:cpu_id = %d Invalid afar status in nvlist\n",
398 		    fltnm, cpu->cpu_cpuid);
399 		return (CMD_EVD_BAD);
400 	}
401 	*afarp = tag_afar;
402 	index = get_index(pstype, tag_afar);
403 	switch (pstype) {
404 		case CMD_PTR_CPU_L2TAG:
405 			payload_namep = FM_EREPORT_PAYLOAD_NAME_L2_DATA;
406 			break;
407 		case CMD_PTR_CPU_L3TAG:
408 			payload_namep = FM_EREPORT_PAYLOAD_NAME_L3_DATA;
409 			break;
410 		default:
411 			return (CMD_EVD_BAD);
412 	}
413 	if (nvlist_lookup_int32(nvl, FM_EREPORT_RECHECK_OF_TAGS,
414 	    &recheck_of_tags) != 0)
415 		recheck_of_tags = 0;
416 	if ((recheck_of_tags) || (test_mode))
417 		return (get_tagdata(cpu, pstype, index, tag_data));
418 	if (nvlist_lookup_uint64_array(nvl, payload_namep,
419 	    (uint64_t **)&ec_data, &sz) != 0) {
420 		fmd_hdl_debug(hdl,
421 		    "\n%s: cpu_id = %d index = %d could not find %s"
422 		    " in nvlist\n",
423 		    fltnm, cpu->cpu_cpuid, index, payload_namep);
424 		fmd_hdl_debug(hdl,
425 		    "\n%s: cpu_id = %d Reading tag data through"
426 		    " mem_cache driver.\n",
427 		    fltnm, cpu->cpu_cpuid);
428 		return (get_tagdata(cpu, pstype, index,
429 		    tag_data));
430 	}
431 	for (i = 0; i < PN_CACHE_NWAYS; i++) {
432 		tag_data[i] = ec_data[i].ec_tag;
433 	}
434 	return (CMD_EVD_OK);
435 }
436 
437 static void
print_ecc(fmd_hdl_t * hdl,cmd_cpu_t * cpu,const char * fltnm,uint64_t * tag_data)438 print_ecc(fmd_hdl_t *hdl, cmd_cpu_t *cpu, const char *fltnm, uint64_t *tag_data)
439 {
440 	int	i;
441 	uint16_t	tag_ecc[PN_CACHE_NWAYS];
442 
443 	for (i = 0; i < PN_CACHE_NWAYS; i++) {
444 		tag_ecc[i] =
445 		    ((tag_data[i] & PN_TAG_ECC_MASK)
446 		    >> PN_LX_TAG_ECC_START_BIT);
447 	}
448 	fmd_hdl_debug(hdl,
449 	    "\n%s: cpu_id = %d ecc[0] = 0x%03x ecc[1] = 0x%03x"
450 	    " ecc[2] = 0x%03x ecc[3] = 0x%03x\n",
451 	    fltnm, cpu->cpu_cpuid, tag_ecc[0], tag_ecc[1], tag_ecc[2],
452 	    tag_ecc[3]);
453 
454 }
455 
456 static int
matching_ecc(uint64_t * tag_data)457 matching_ecc(uint64_t *tag_data)
458 {
459 	int	i;
460 	uint16_t	tag_ecc[PN_CACHE_NWAYS];
461 
462 	for (i = 0; i < PN_CACHE_NWAYS; i++) {
463 		tag_ecc[i] =
464 		    ((tag_data[i] & PN_TAG_ECC_MASK)
465 		    >> PN_LX_TAG_ECC_START_BIT);
466 		if (tag_ecc[i] != tag_ecc[0]) {
467 			return (1);
468 		}
469 	}
470 	return (0);
471 }
472 
473 static void
gen_data_for_ecc(uint64_t * tag_data,uint64_t * data_for_ecc_gen,cmd_ptrsubtype_t pstype)474 gen_data_for_ecc(uint64_t *tag_data, uint64_t *data_for_ecc_gen,
475 		    cmd_ptrsubtype_t pstype)
476 {
477 	uint64_t	ptag[PN_CACHE_NWAYS];
478 	uint8_t		state[PN_CACHE_NWAYS];
479 	int		i;
480 	uint16_t	tag_ecc[PN_CACHE_NWAYS];
481 	uint8_t		bit_position;
482 
483 	for (i = 0; i < PN_CACHE_NWAYS; i++) {
484 		state[i] = tag_data[i] & CH_ECSTATE_MASK;
485 		tag_ecc[i] =
486 		    ((tag_data[i] & PN_TAG_ECC_MASK)
487 		    >> PN_LX_TAG_ECC_START_BIT);
488 		switch (pstype) {
489 			case CMD_PTR_CPU_L2TAG:
490 				ptag[i] = (tag_data[i] >> PN_L2_PTAG_SHIFT) &
491 				    L2_PTAG_MASK;
492 				break;
493 			case CMD_PTR_CPU_L3TAG:
494 				ptag[i] = (tag_data[i] >> PN_L3_PTAG_SHIFT) &
495 				    L3_PTAG_MASK;
496 				break;
497 		}
498 	}
499 	/*
500 	 * We now assemble the 128 bit data swizzling the Physical tags
501 	 * and states we obtained for all the 4 ways.
502 	 */
503 	data_for_ecc_gen[0] = 0;	/* high order 64 bits */
504 	data_for_ecc_gen[1] = 0;	/* low order 64 bits */
505 	switch (pstype) {
506 		case CMD_PTR_CPU_L2TAG:
507 			data_for_ecc_gen[1] = state[0];	/* way 0 state */
508 			data_for_ecc_gen[1] |=
509 			    (state[1] << 3); /* way 1 state */
510 			data_for_ecc_gen[1] |=
511 			    (state[2] << 6); /* way 2 state */
512 			data_for_ecc_gen[1] |=
513 			    (state[3] << 9); /* way 3 state */
514 			data_for_ecc_gen[1] |= (ptag[0] << 12); /* way 0 ptag */
515 			data_for_ecc_gen[1] |= (ptag[2] << 36); /* way 2 ptag */
516 			/* bits 63:60 of low order 64 bits are 0s */
517 
518 			/*
519 			 * We now start with hig order 64 bits.
520 			 * the low 12 bits are 0s
521 			 */
522 			data_for_ecc_gen[0] |= (ptag[1] << 12); /* way 1 ptag */
523 			data_for_ecc_gen[0] |= (ptag[3] << 36); /* way 3 ptag */
524 			break;
525 		case CMD_PTR_CPU_L3TAG:
526 			bit_position = 0;
527 			/*
528 			 * Swizzle state bits for way 1 and way 3
529 			 */
530 			for (i = 0; i < 3; i++) {
531 				data_for_ecc_gen[1] |=
532 				    (((state[1] >> i) & 1) << bit_position);
533 				bit_position++;
534 				data_for_ecc_gen[1] |=
535 				    (((state[3] >> i) & 1) << bit_position);
536 				bit_position++;
537 			}
538 			/*
539 			 * Swizzle physical tag bits for way 1 and way 3
540 			 */
541 			for (i = 0; i < 20; i++) {
542 				data_for_ecc_gen[1] |=
543 				    (((ptag[1] >> i) & 1) << bit_position);
544 				bit_position++;
545 				data_for_ecc_gen[1] |=
546 				    (((ptag[3] >> i) & 1) << bit_position);
547 				bit_position++;
548 			}
549 			/*
550 			 * start the high order 64 bits.
551 			 */
552 			bit_position = 0;
553 			/*
554 			 * Swizzle state bits for way 0 and way 2
555 			 */
556 			for (i = 0; i < 3; i++) {
557 				data_for_ecc_gen[0] |=
558 				    (((state[0] >> i) & 1) << bit_position);
559 				bit_position++;
560 				data_for_ecc_gen[0] |=
561 				    (((state[2] >> i) & 1) << bit_position);
562 				bit_position++;
563 			}
564 			/*
565 			 * Swizzle physical tag bits for way 0 and way 2
566 			 */
567 			for (i = 0; i < 20; i++) {
568 				data_for_ecc_gen[0] |=
569 				    (((ptag[0] >> i) & 1) << bit_position);
570 				bit_position++;
571 				data_for_ecc_gen[0] |=
572 				    (((ptag[2] >> i) & 1) << bit_position);
573 				bit_position++;
574 			}
575 			break;
576 	}
577 }
578 
579 static uint16_t
compute_syndrome(uint64_t * tag_data,cmd_ptrsubtype_t pstype)580 compute_syndrome(uint64_t *tag_data, cmd_ptrsubtype_t pstype)
581 {
582 	uint64_t	tag_synd;
583 	uint64_t	data_for_ecc_gen[2];
584 	uint16_t	tag_ecc;
585 
586 	gen_data_for_ecc(tag_data, data_for_ecc_gen, pstype);
587 	tag_ecc = ((tag_data[0] & PN_TAG_ECC_MASK) >> PN_LX_TAG_ECC_START_BIT);
588 	tag_synd = calcsynd(data_for_ecc_gen[0], data_for_ecc_gen[1],
589 	    (uint64_t)tag_ecc);
590 	return (tag_synd);
591 }
592 
593 static int16_t
find_bit_stickiness(uint64_t * tag_data,int8_t way,int16_t bit)594 find_bit_stickiness(uint64_t *tag_data, int8_t way, int16_t bit)
595 {
596 	int16_t	sticky_bit;
597 
598 	sticky_bit = bit;
599 	if ((tag_data[way] & ((uint64_t)1 << bit)) != 0)
600 		sticky_bit |= MSB_BIT;
601 	return (sticky_bit);
602 }
603 
604 static cmd_Lxcache_t *
cmd_create_and_destroy_Lxcache(fmd_hdl_t * hdl,cmd_cpu_t * cpu,cmd_Lxcache_t * Lxcache)605 cmd_create_and_destroy_Lxcache(fmd_hdl_t *hdl, cmd_cpu_t *cpu,
606 	cmd_Lxcache_t *Lxcache)
607 {
608 	const char		*fltnm;
609 	cmd_Lxcache_t	*new_Lxcache;
610 
611 	fltnm = cmd_type_to_str(Lxcache->Lxcache_type);
612 
613 	/*
614 	 * We first create a new Lxcache and add the event ep
615 	 * that is in Lxcache to the new case we create.
616 	 * we then destroy the Lxcache that has the event ep in its SERD engine.
617 	 */
618 	new_Lxcache = cmd_Lxcache_create(hdl, Lxcache->xr, cpu,
619 	    cpu->cpu_asru_nvl,
620 	    Lxcache->Lxcache_type,
621 	    Lxcache->Lxcache_index, Lxcache->Lxcache_way, Lxcache->Lxcache_bit);
622 	if (new_Lxcache == NULL) {
623 		fmd_hdl_debug(hdl,
624 		    "\n%s:cpu_id %d:Failed to create a Lxcache for"
625 		    " index %d way %d bit %d\n",
626 		    fltnm, cpu->cpu_cpuid, Lxcache->Lxcache_index,
627 		    Lxcache->Lxcache_way, Lxcache->Lxcache_bit);
628 		return (NULL);
629 	}
630 	(void) cmd_create_case_for_Lxcache(hdl, cpu, new_Lxcache);
631 	cmd_Lxcache_destroy(hdl, cpu, Lxcache);
632 	return (new_Lxcache);
633 }
634 
635 int
cmd_Lxcache_retire_as_reason(fmd_hdl_t * hdl,cmd_cpu_t * cpu,cmd_Lxcache_t * Lxcache,const char * fltnm,int32_t reason)636 cmd_Lxcache_retire_as_reason(fmd_hdl_t *hdl, cmd_cpu_t *cpu,
637     cmd_Lxcache_t *Lxcache, const char *fltnm, int32_t reason)
638 {
639 	boolean_t	ret;
640 	uint_t		certainty;
641 
642 	if (reason == CMD_LXSUSPECT_0_TAG) {
643 		/*
644 		 * clear MSB bit to retire as SUSPECT_0_TAG
645 		 * We need to update the Lxcache asru to reflect
646 		 * the change in bit value.
647 		 */
648 		Lxcache->Lxcache_bit &= CLEAR_MSB_BIT;
649 		errno = nvlist_add_uint16(
650 		    Lxcache->Lxcache_asru_nvl,
651 		    FM_FMRI_CPU_CACHE_BIT,
652 		    Lxcache->Lxcache_bit);
653 		if (errno) {
654 			fmd_hdl_debug(hdl,
655 			    "\n%s:cpu_id %d: failed to update",
656 			    " CACHE_BIT in asru.\n",
657 			    fltnm, cpu->cpu_cpuid);
658 			return (CMD_EVD_BAD);
659 		}
660 	}
661 	if (reason == CMD_LXCONVICTED)
662 		certainty = HUNDRED_PERCENT;
663 	else
664 		certainty = SUSPECT_PERCENT;
665 	ret = cmd_Lxcache_retire(hdl, cpu, Lxcache, fltnm, certainty);
666 	if (reason == CMD_LXSUSPECT_0_TAG)
667 		Lxcache->Lxcache_bit |= SET_MSB_BIT;
668 	if (ret == B_FALSE)
669 		return (CMD_EVD_BAD);
670 	Lxcache->Lxcache_reason = reason;
671 	/*
672 	 * Update the persistence storage of
673 	 * Lxcache.
674 	 */
675 	fmd_hdl_debug(hdl,
676 	    "\n%s:cpu_id %d:reason = %s flags = %s\n",
677 	    fltnm, cpu->cpu_cpuid,
678 	    cmd_reason_to_str(Lxcache->Lxcache_reason),
679 	    cmd_flags_to_str(Lxcache->Lxcache_flags));
680 	cmd_Lxcache_write(hdl, Lxcache);
681 	return (CMD_EVD_OK);
682 }
683 
/*
 * Handle an anonymous (way unknown) TAG or DATA error by retiring the
 * lowest-numbered retirable way at the recorded index as a SUSPECT.
 * On every exit path the anonymous_Lxcache passed in has been
 * destroyed; on success a new per-way Lxcache has been created and
 * retired with reason SUSPECT_0_TAG (tag errors) or SUSPECT_DATA.
 * If no way can be retired (or 3 ways are already retired for a DATA
 * error), the whole CPU is faulted instead.
 * Returns CMD_EVD_OK or CMD_EVD_BAD.
 */
int
retire_lowest_retirable_way_as_suspect(fmd_hdl_t *hdl, cmd_cpu_t *cpu,
    cmd_Lxcache_t *anonymous_Lxcache, const char *fltnm)
{
	/*
	 * This routine is called only when handling anonymous TAG or DATA
	 * errors. When we exit this routine we would have destroyed the
	 * anonymous_Lxcache structure that was passed to us and created
	 * a new Lxcache if we were successful in determining a way to retire.
	 */
	int8_t	lowest_retirable_way, ways_retired;
	int32_t	reason;
	cmd_ptrsubtype_t type;
	cmd_Lxcache_t *new_Lxcache;

	ways_retired = get_index_retired_ways(cpu,
	    anonymous_Lxcache->Lxcache_type,
	    anonymous_Lxcache->Lxcache_index);
	if (ways_retired == -1) {
		/*
		 * Couldn't determine how many ways have been retired at this
		 * index. Destroy the anonymous_Lxcache and return failure.
		 */
		cmd_Lxcache_destroy(hdl, cpu, anonymous_Lxcache);
		return (CMD_EVD_BAD);
	}
	/*
	 * Before retiring a way check if we have already
	 * retired 3 ways for this index.
	 * For TAG errors we will not perform this check because
	 * we could reretire cachlines retired for DATA errors.
	 * The get_lowest_retirable_way() will ensure that we do
	 * not end up retiring all 4 ways.
	 */
	if (!IS_TAG(anonymous_Lxcache->Lxcache_type)) {
		if (ways_retired >= 3) {
			fmd_hdl_debug(hdl,
			    "\n%s: cpu %d: num of ways retired for index %d"
			    " is %d will fault the CPU\n",
			    fltnm, cpu->cpu_cpuid,
			    anonymous_Lxcache->Lxcache_index, ways_retired);
			/* Save the type: the destroy below frees Lxcache. */
			type = anonymous_Lxcache->Lxcache_type;
			/*
			 * destroy the anonymous_Lxcache
			 */
			cmd_Lxcache_destroy(hdl, cpu, anonymous_Lxcache);
			cmd_fault_the_cpu(hdl, cpu, type, fltnm);
			return (CMD_EVD_OK);
		}
	}
	/*
	 * No ways have been retired as "SUSPECT" for this bit.
	 * We need to retire the lowest unretired way as suspect.
	 */
	fmd_hdl_debug(hdl,
	    "\n%s: cpu_id %d Checking for the lowest retirable"
	    " way at index %d\n",
	    fltnm, cpu->cpu_cpuid, anonymous_Lxcache->Lxcache_index);
	lowest_retirable_way = cmd_Lxcache_get_lowest_retirable_way(cpu,
	    anonymous_Lxcache->Lxcache_index, anonymous_Lxcache->Lxcache_type);
	if (lowest_retirable_way != -1) {
		fmd_hdl_debug(hdl,
		    "\n%s: cpu_id %d lowest retirable way is %d\n",
		    fltnm, cpu->cpu_cpuid, lowest_retirable_way);
		anonymous_Lxcache->Lxcache_way = lowest_retirable_way;
		/* Rebuild the Lxcache with the way now filled in. */
		new_Lxcache = cmd_create_and_destroy_Lxcache(hdl, cpu,
		    anonymous_Lxcache);
		if ((new_Lxcache == NULL) ||
		    (new_Lxcache->Lxcache_case.cc_cp == NULL)) {
			return (CMD_EVD_BAD);
		}
		if (IS_TAG(new_Lxcache->Lxcache_type))
			reason = CMD_LXSUSPECT_0_TAG;
		else
			reason = CMD_LXSUSPECT_DATA;
		return (cmd_Lxcache_retire_as_reason(hdl, cpu, new_Lxcache,
		    fltnm, reason));
	} else {
		fmd_hdl_debug(hdl,
		    "\n%s:cpu_id %d we are unable to determine which"
		    " way is faulty at cache index %d."
		    " Will retire the CPU.\nRecommended-Action:"
		    " Service action required\n",
		    fltnm, cpu->cpu_cpuid, anonymous_Lxcache->Lxcache_index);
		/* Save the type: the destroy below frees Lxcache. */
		type = anonymous_Lxcache->Lxcache_type;
		/*
		 * destroy the anonymous_Lxcache
		 */
		cmd_Lxcache_destroy(hdl, cpu, anonymous_Lxcache);
		cmd_fault_the_cpu(hdl, cpu, type, fltnm);
		return (CMD_EVD_OK);
	}
}
777 
/*
 * Handle an anonymous TAG or DATA error when a way at this index is
 * already retired as a SUSPECT: unretire (and destroy) that
 * suspect_Lxcache, then retire the next retirable way at the same
 * index as the new suspect.  On every exit path the anonymous_Lxcache
 * passed in has been destroyed.  If no further way is retirable the
 * CPU is faulted (the errors cannot be localized to a way).
 * Returns CMD_EVD_OK or CMD_EVD_BAD.
 */
int
unretire_suspect_and_retire_next_retirable_way(fmd_hdl_t *hdl, cmd_cpu_t *cpu,
    cmd_Lxcache_t *suspect_Lxcache, cmd_Lxcache_t *anonymous_Lxcache,
    const char *fltnm)
{
	int8_t	retired_way, next_retirable_way;
	int32_t	retired_index;
	cmd_ptrsubtype_t retired_type;
	int32_t	reason;
	cmd_Lxcache_t *new_Lxcache;

	/*
	 * This routine is called only when handling anonymous TAG or DATA
	 * errors. When we exit this routine we would have destroyed the
	 * anonymous_Lxcache structure that was passed to us.
	 */
	fmd_hdl_debug(hdl,
	    "\n%s:cpu_id %d found index %d way %d"
	    " bit %d retired as %s. Will unretire this now.\n",
	    fltnm, cpu->cpu_cpuid, suspect_Lxcache->Lxcache_index,
	    suspect_Lxcache->Lxcache_way, suspect_Lxcache->Lxcache_bit,
	    cmd_reason_to_str(suspect_Lxcache->Lxcache_reason));
	/*
	 * Save the way because we will destroy the
	 * suspect_Lxcache after we successfully unretire it.
	 */
	retired_way = suspect_Lxcache->Lxcache_way;
	retired_index = suspect_Lxcache->Lxcache_index;
	retired_type = suspect_Lxcache->Lxcache_type;
	/*
	 * unretire the retired_way.
	 */
	if (cmd_Lxcache_unretire(hdl, cpu, suspect_Lxcache,
	    fltnm)
	    == B_TRUE) {
		suspect_Lxcache->Lxcache_reason =
		    CMD_LXFUNCTIONING;
		fmd_hdl_debug(hdl,
		    "\n%s:cpu_id %d index %d way %d"
		    " successfully unretired. Will"
		    " destroy this Lxcache now.\n",
		    fltnm, cpu->cpu_cpuid, suspect_Lxcache->Lxcache_index,
		    suspect_Lxcache->Lxcache_way);
		cmd_Lxcache_destroy(hdl, cpu, suspect_Lxcache);
	} else {
		/*
		 * Unretire failed; give up and destroy the
		 * anonymous_Lxcache before reporting failure.
		 */
		cmd_Lxcache_destroy(hdl, cpu, anonymous_Lxcache);
		return (CMD_EVD_BAD);
	}
	/*
	 * retire the next retirable way
	 */
	next_retirable_way = cmd_Lxcache_get_next_retirable_way(cpu,
	    retired_index,
	    retired_type, retired_way);
	if (next_retirable_way == -1) {
		/*
		 * There is no retirable way that is next to the
		 * one we just retired. We need to offline the
		 * CPU since we are unable to determine which
		 * way is reporting the errors.
		 */
		fmd_hdl_debug(hdl,
		    "\n%s:cpu_id %d we are unable to determine"
		    " which way is faulty at cache index %d."
		    " It is likely that we have a leaky bit"
		    " that gets corrected.\n Will retire"
		    " the CPU.\nRecommended-Action: Service"
		    " action required\n",
		    fltnm, cpu->cpu_cpuid, retired_index);
		/*
		 * destroy the anonymous_Lxcache
		 */
		cmd_Lxcache_destroy(hdl, cpu, anonymous_Lxcache);
		cmd_fault_the_cpu(hdl, cpu, retired_type, fltnm);
		return (CMD_EVD_OK);
	} else {
		fmd_hdl_debug(hdl,
		    "\n%s:cpu_id %d found way %d at index %d to"
		    " retire as SUSPECT_0/SUSPECT_DATA\n",
		    fltnm, cpu->cpu_cpuid, next_retirable_way, retired_index);
		/*
		 * We need to create a new Lxcache struture.
		 * The existing Lxcache is for anonymous way.
		 */
		anonymous_Lxcache->Lxcache_way = next_retirable_way;
		new_Lxcache = cmd_create_and_destroy_Lxcache(hdl,
		    cpu, anonymous_Lxcache);
		if ((new_Lxcache == NULL) ||
		    (new_Lxcache->Lxcache_case.cc_cp == NULL)) {
			return (CMD_EVD_BAD);
		}
		if (IS_TAG(new_Lxcache->Lxcache_type))
			reason = CMD_LXSUSPECT_0_TAG;
		else
			reason = CMD_LXSUSPECT_DATA;
		return (cmd_Lxcache_retire_as_reason(hdl, cpu, new_Lxcache,
		    fltnm, reason));
	}
}
880 
881 void
find_and_destroy_anonymous_Lxcache(fmd_hdl_t * hdl,cmd_cpu_t * cpu,cmd_ptrsubtype_t pstype,int32_t index)882 find_and_destroy_anonymous_Lxcache(fmd_hdl_t *hdl, cmd_cpu_t *cpu,
883     cmd_ptrsubtype_t pstype, int32_t index)
884 {
885 	cmd_Lxcache_t *anonymous_Lxcache;
886 	const char	*fltnm;
887 
888 	fltnm = cmd_type_to_str(pstype);
889 	anonymous_Lxcache =
890 	    cmd_Lxcache_lookup_by_type_index_way_bit(cpu,
891 	    pstype, index, -1, -1);
892 	if (anonymous_Lxcache != NULL) {
893 		fmd_hdl_debug(hdl,
894 		    "\n%s:cpu_id = %d index = %d We are destroying the"
895 		    " anonymous Lxcache now.\n",
896 		    fltnm, cpu->cpu_cpuid, index);
897 		/*
898 		 * Free the resources allocated to handle
899 		 * recheck_of_tags. Delete the Lxcache.
900 		 */
901 		cmd_Lxcache_destroy(hdl, cpu,
902 		    anonymous_Lxcache);
903 	}
904 }
905 
906 void
cmd_Lxcache_anonymous_tag_error_timeout(fmd_hdl_t * hdl,id_t id)907 cmd_Lxcache_anonymous_tag_error_timeout(fmd_hdl_t *hdl, id_t id)
908 {
909 	cmd_Lxcache_t	*Lxcache;
910 	const char	*class;
911 
912 
913 	/*
914 	 * We search thru the entire Lxcache structures to find
915 	 * a matching id.
916 	 */
917 	Lxcache = cmd_Lxcache_lookup_by_timeout_id(id);
918 	if (Lxcache == NULL) {
919 		fmd_hdl_debug(hdl,
920 		    "Could not find Lxcache for timeout_id 0x%x\n", id);
921 		return;
922 	}
923 	fmd_hdl_debug(hdl,
924 	    "\n%s:anonymous_tag_error_timeout:index = %d\n",
925 	    cmd_type_to_str(Lxcache->Lxcache_type),
926 	    Lxcache->Lxcache_index);
927 	/*
928 	 * Set timeout_id to -1 to indicate that we have processed the
929 	 * timeout.
930 	 */
931 	Lxcache->Lxcache_timeout_id = -1;
932 	switch (Lxcache->Lxcache_type) {
933 		case CMD_PTR_CPU_L2TAG:
934 			class = "ereport.cpu.ultraSPARC-IVplus.thce";
935 			(void) cmd_txce(hdl, Lxcache->Lxcache_ep,
936 			    Lxcache->Lxcache_nvl,
937 			    class, Lxcache->Lxcache_clcode);
938 			break;
939 		case CMD_PTR_CPU_L3TAG:
940 			class = "ereport.cpu.ultraSPARC-IVplus.l3-thce";
941 			(void) cmd_l3_thce(hdl, Lxcache->Lxcache_ep,
942 			    Lxcache->Lxcache_nvl,
943 			    class, Lxcache->Lxcache_clcode);
944 			break;
945 		default:
946 			fmd_hdl_debug(hdl,
947 			    "Unexpected pstype 0x%x found in"
948 			    " anonymous_tag_error_timeout: index = %d\n",
949 			    Lxcache->Lxcache_type,
950 			    Lxcache->Lxcache_index);
951 			return;
952 	}
953 }
954 
955 cmd_evdisp_t
cmd_us4plus_tag_err(fmd_hdl_t * hdl,fmd_event_t * ep,nvlist_t * nvl,cmd_cpu_t * cpu,cmd_ptrsubtype_t pstype,const char * serdn,const char * serdt,const char * fltnm,cmd_errcl_t clcode)956 cmd_us4plus_tag_err(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl,
957 		cmd_cpu_t *cpu, cmd_ptrsubtype_t pstype,
958 		const char *serdn, const char *serdt,
959 		const char *fltnm, cmd_errcl_t clcode)
960 {
961 	uint64_t	tag_afar;
962 	int32_t	index;
963 	int8_t		way;
964 	int16_t		tag_bit, bit, sticky_bit;
965 	cmd_Lxcache_t	*Lxcache, *suspect_Lxcache, *retired_Lxcache;
966 	cmd_Lxcache_t	*anonymous_Lxcache;
967 	uint64_t	tag_synd;
968 	uint64_t	tag_data[PN_CACHE_NWAYS];
969 	uint8_t		state;
970 	int		ways_retired, ret;
971 	int		retries_for_ecc_match;
972 	int32_t		recheck_of_tags;
973 	int		way_already_retired = 0;
974 
975 	/*
976 	 * We now extract physical tags and states
977 	 * and also look for matching ECC on all 4 ways.
978 	 */
979 	ret = extract_data_from_ereport_payload(hdl, nvl, cpu, pstype,
980 	    &tag_afar, tag_data, fltnm);
981 	if (ret != 0)
982 		return (ret);
983 	index = get_index(pstype, tag_afar);
984 	retries_for_ecc_match = 0;
985 	while (matching_ecc(tag_data) != 0) {
986 		if (retries_for_ecc_match >= MAX_RETRIES_FOR_ECC_MATCH)
987 			return (CMD_EVD_BAD);
988 		print_ecc(hdl, cpu, fltnm, tag_data);
989 		fmd_hdl_debug(hdl,
990 		    "\n%s:cpu_id = %d index = %d ECCs don't match.\n"
991 		    "Reading tag info again.\n",
992 		    fltnm, cpu->cpu_cpuid, index);
993 		(void) get_tagdata(cpu, pstype, index, tag_data);
994 		retries_for_ecc_match++;
995 	}
996 	ways_retired = get_retired_ways(tag_data);
997 	fmd_hdl_debug(hdl,
998 	    "\n%s:cpu_id %d: found %d ways retired at the index %d\n",
999 	    fltnm, cpu->cpu_cpuid, ways_retired, index);
1000 	tag_synd = compute_syndrome(tag_data, pstype);
1001 	ret = nvlist_lookup_int32(nvl, FM_EREPORT_RECHECK_OF_TAGS,
1002 	    &recheck_of_tags);
1003 	if (ret != CMD_EVD_OK) {
1004 		fmd_hdl_debug(hdl,
1005 		    "ret value = %d for nvlist_lookup of recheck_of_tags\n",
1006 		    ret);
1007 		recheck_of_tags = 0;
1008 	}
1009 	if (tag_synd == 0) {
1010 		/*
1011 		 * The bit has been corrected by writeback, we will
1012 		 * first check if we are processing the re-check of tags
1013 		 * that we scheduled thru the timeout call.
1014 		 * if so we will exit if we reached the max retries.
1015 		 * Else we start a timeout and exit.
1016 		 * We will create a Lxcache structure for this index with way
1017 		 * as -1 and bit as -1. We will also keep a count of
1018 		 * attempts we made to check the tag data at this index.
1019 		 *
1020 		 */
1021 		way = -1;
1022 		bit = -1;
1023 		Lxcache = cmd_Lxcache_lookup_by_type_index_way_bit(cpu, pstype,
1024 		    index, way, bit);
1025 		if (recheck_of_tags) {
1026 			/*
1027 			 * We are processing the re-read of tags scheduled by
1028 			 * timeout. Exit if retry limit has been
1029 			 * reached. Else start another timeout.
1030 			 */
1031 			if (Lxcache == NULL) {
1032 				/*
1033 				 * This shouldn't happen.
1034 				 */
1035 				fmd_hdl_debug(hdl,
1036 				    "\n%s: cpu_id = %d failed to lookup"
1037 				    " index = %d way %d bit %d\n",
1038 				    fltnm, cpu->cpu_cpuid, index, way, bit);
1039 				return (CMD_EVD_BAD);
1040 			}
1041 			fmd_hdl_debug(hdl,
1042 			    "\n%s: cpu_id = %d index = %d syndrome"
1043 			    " computed is 0 in attempt #%d.\n",
1044 			    fltnm, cpu->cpu_cpuid, index,
1045 			    Lxcache->Lxcache_retry_count);
1046 			if (Lxcache->Lxcache_retry_count >=
1047 			    RETRIES_TO_BE_DONE_WHEN_SYND_IS_ZERO) {
1048 				/*
1049 				 * We free only the nvl list here.
1050 				 * anonymous SERD engine will be freed
1051 				 * when the Lxcache gets destroyed.
1052 				 * We need the anonymous SERD engine still
1053 				 * because it has the event ep.
1054 				 * reset or destroy of SERD engine frees the
1055 				 * event ep.
1056 				 */
1057 				if (Lxcache->Lxcache_nvl != NULL) {
1058 					nvlist_free(Lxcache->Lxcache_nvl);
1059 					Lxcache->Lxcache_nvl = NULL;
1060 				}
1061 				fmd_hdl_debug(hdl,
1062 		    "\n%s:cpu_id %d Max retry count reached. Giving up.\n",
1063 				    fltnm, cpu->cpu_cpuid);
1064 				Lxcache->Lxcache_timeout_id = -1;
1065 				Lxcache->Lxcache_retry_count = 0;
1066 				goto process_after_finding_way_bit;
1067 			} else {
1068 				Lxcache->Lxcache_retry_count++;
1069 				Lxcache->Lxcache_timeout_id =
1070 				    fmd_timer_install(hdl,
1071 				    (void *)CMD_TIMERTYPE_ANONYMOUS_TAG_ERROR,
1072 				    NULL,
1073 				    (cmd_Lxcache_recheck_tags_delay[
1074 				    Lxcache->Lxcache_retry_count] * NANOSEC));
1075 				return (CMD_EVD_OK);
1076 			}
1077 		}
1078 		/*
1079 		 * Check if we already have a Lxcache structure
1080 		 * with anonymous way and bit created.
1081 		 */
1082 		if (Lxcache == NULL) {
1083 			Lxcache = cmd_Lxcache_create(hdl, 0, cpu,
1084 			    cpu->cpu_asru_nvl, pstype, index, way, bit);
1085 			if (Lxcache == NULL) {
1086 				fmd_hdl_debug(hdl,
1087 				    "\n%s:cpu_id %d Failed to create Lxcache"
1088 				    " for index=%d\n",
1089 				    fltnm, cpu->cpu_cpuid, index);
1090 				return (CMD_EVD_BAD);
1091 			}
1092 		}
1093 		if (Lxcache->Lxcache_timeout_id != -1) {
1094 			/*
1095 			 * We have another syndrome = 0 condition while we are
1096 			 * still in the process of retrying for the previous
1097 			 * condition.
1098 			 */
1099 			fmd_hdl_debug(hdl,
1100 			    "\n%s: cpu_id = %d index = %d We have another"
1101 			    " syndrome = 0 condition while we have already"
1102 			    " scheduled a timeout. We will ignore this"
1103 			    " event.\n",
1104 			    fltnm, cpu->cpu_cpuid, index);
1105 			return (CMD_EVD_OK);
1106 		}
1107 		fmd_hdl_debug(hdl,
1108 		    "\n%s: cpu_id = %d index = %d syndrome computed is 0."
1109 		    "Looks like the bit got corrected."
1110 		    " Will check later to see if it is OK.\n",
1111 		    fltnm, cpu->cpu_cpuid, index);
1112 		/*
1113 		 * We need to store the following arguments passed to
1114 		 * this function(tag_error_handler) so that we can
1115 		 * invoke this function from timeout routine.
1116 		 *
1117 		 * nvl, ep, clcode
1118 		 */
1119 		if (Lxcache->Lxcache_nvl == NULL) {
1120 			if (nvlist_dup(nvl, &Lxcache->Lxcache_nvl, 0) != 0) {
1121 				fmd_hdl_debug(hdl,
1122 				    "\n%s:cpu_id %d Failed to duplicate nvl"
1123 				    " for index=%d\n",
1124 				    fltnm, cpu->cpu_cpuid, index);
1125 				return (CMD_EVD_BAD);
1126 			}
1127 			if (nvlist_add_int32(Lxcache->Lxcache_nvl,
1128 			    FM_EREPORT_RECHECK_OF_TAGS, 1) != 0) {
1129 				fmd_hdl_debug(hdl,
1130 				    "\n%s:cpu_id %d Failed to add"
1131 				    " RECHECK_OF_TAGS in nvl for index=%d\n",
1132 				    fltnm, cpu->cpu_cpuid, index);
1133 				return (CMD_EVD_BAD);
1134 			}
1135 		}
1136 		/*
1137 		 * We are called with CMP_CPU_LEVEL_CORE masked out
1138 		 * from cmd_txce(), cmd_l3_thce() routines.
1139 		 * We need to set CMD_CPU_LEVEL_CORE because we want to handle
1140 		 * both the cores on the Chip as one single cpu_id.
1141 		 */
1142 		Lxcache->Lxcache_clcode = (clcode | CMD_CPU_LEVEL_CORE);
1143 		if (Lxcache->Lxcache_ep == NULL) {
1144 			Lxcache->Lxcache_ep = ep;
1145 			/*
1146 			 * we need to preserve the event ep so that it does
1147 			 * not get destroyed when we return from this call.
1148 			 * We do that by adding the event ep to the SERD engine.
1149 			 * The SERD engine we create is different from the one
1150 			 * we create when we handle the actual event at label
1151 			 * process_after_finding_way_bit.
1152 			 */
1153 			Lxcache->Lxcache_serdnm =
1154 			    cmd_Lxcache_anonymous_serdnm_create(hdl,
1155 			    cpu->cpu_cpuid, pstype, index,
1156 			    way, bit);
1157 			if (!fmd_serd_exists(hdl, Lxcache->Lxcache_serdnm)) {
1158 				fmd_serd_create(hdl, Lxcache->Lxcache_serdnm,
1159 				    fmd_prop_get_int32(hdl, serdn),
1160 				    fmd_prop_get_int64(hdl, serdt));
1161 				fmd_hdl_debug(hdl,
1162 				    "\n%s: cpu_id %d: created a SERD engine"
1163 				    " %s\n",
1164 				    fltnm, cpu->cpu_cpuid,
1165 				    Lxcache->Lxcache_serdnm);
1166 			}
1167 			(void) fmd_serd_record(hdl,
1168 			    Lxcache->Lxcache_serdnm,
1169 			    ep);
1170 		}
1171 		Lxcache->Lxcache_retry_count++;
1172 		Lxcache->Lxcache_timeout_id =
1173 		    fmd_timer_install(hdl,
1174 		    (void *)CMD_TIMERTYPE_ANONYMOUS_TAG_ERROR, NULL,
1175 		    (cmd_Lxcache_recheck_tags_delay[
1176 		    Lxcache->Lxcache_retry_count] * NANOSEC));
1177 		return (CMD_EVD_OK);
1178 
1179 	} else {
1180 		/*
1181 		 * tag_synd != 0
1182 		 * determine way and bit
1183 		 */
1184 		tag_bit = ecc_syndrome_tab[tag_synd & 0x1ff];
1185 		fmd_hdl_debug(hdl,
1186 		    "\n%s: cpu_id = %d index = %d tag_bit %03d is faulty.\n",
1187 		    fltnm, cpu->cpu_cpuid, index, tag_bit);
1188 		if ((tag_bit > C8)) {
1189 			fmd_hdl_debug(hdl, "%s: cpu_id = %d"
1190 			    " Unexpected MTAG or Multiple bit error detected\n",
1191 			    fltnm, cpu->cpu_cpuid);
1192 			find_and_destroy_anonymous_Lxcache(hdl, cpu, pstype,
1193 			    index);
1194 			return (CMD_EVD_BAD);
1195 		}
1196 		if ((tag_bit >= C0) && (tag_bit <= C8)) {
1197 			/*
1198 			 * ECC bit is corrupted.
1199 			 * Need to offline the CPU
1200 			 */
1201 			bit = (tag_bit - C0) + PN_LX_TAG_ECC_START_BIT;
1202 			way = 0;
1203 			fmd_hdl_debug(hdl,
1204 			    "\n%s: cpu_id = %d ECC bit is faulty.\n",
1205 			    fltnm, cpu->cpu_cpuid);
1206 		} else {
1207 			bit = tag_bit_to_way_bit(pstype, tag_bit);
1208 			way = bit_to_way(pstype, tag_bit);
1209 			if (way < 0) {
1210 				fmd_hdl_debug(hdl,
1211 				    "\n%s: cpu_id = %d %d bit indicted is a"
1212 				    " meta bit  !!\n",
1213 				    fltnm, cpu->cpu_cpuid, bit);
1214 				find_and_destroy_anonymous_Lxcache(hdl, cpu,
1215 				    pstype,
1216 				    index);
1217 				return (CMD_EVD_BAD);
1218 			}
1219 		}
1220 	}	/* end of tag_synd != 0 */
1221 process_after_finding_way_bit:
1222 	if ((Lxcache = cmd_Lxcache_lookup_by_type_index_way_bit(cpu, pstype,
1223 	    index, way,
1224 	    bit)) != NULL &&
1225 	    Lxcache->Lxcache_case.cc_cp != NULL &&
1226 	    fmd_case_solved(hdl, Lxcache->Lxcache_case.cc_cp)) {
1227 		fmd_hdl_debug(hdl,
1228 		    "\n%s:cpu %d: the case for %s is already solved.\n",
1229 		    fltnm, cpu->cpu_cpuid, Lxcache->Lxcache_bufname);
1230 		find_and_destroy_anonymous_Lxcache(hdl, cpu, pstype, index);
1231 		return (CMD_EVD_REDUND);
1232 	}
1233 
1234 	if (Lxcache == NULL)
1235 		Lxcache = cmd_Lxcache_create(hdl, 0, cpu, cpu->cpu_asru_nvl,
1236 		    pstype, index, way, bit);
1237 	if (Lxcache == NULL) {
1238 		fmd_hdl_debug(hdl,
1239 		    "\n%s:cpu %d: Failed to create Lxcache for index %d",
1240 		    " way %d bit %d\n",
1241 		    fltnm, cpu->cpu_cpuid, index, way, bit);
1242 		find_and_destroy_anonymous_Lxcache(hdl, cpu, pstype, index);
1243 		return (CMD_EVD_BAD);
1244 	}
1245 	if (cmd_create_case_for_Lxcache(hdl, cpu, Lxcache) == B_FALSE) {
1246 		find_and_destroy_anonymous_Lxcache(hdl, cpu, pstype, index);
1247 		return (CMD_EVD_BAD);
1248 	}
1249 	if (Lxcache->Lxcache_case.cc_serdnm == NULL) {
1250 		Lxcache->Lxcache_case.cc_serdnm = cmd_Lxcache_serdnm_create(hdl,
1251 		    cpu->cpu_cpuid, pstype, index,
1252 		    way, bit);
1253 		if (!fmd_serd_exists(hdl, Lxcache->Lxcache_case.cc_serdnm)) {
1254 			fmd_serd_create(hdl, Lxcache->Lxcache_case.cc_serdnm,
1255 			    fmd_prop_get_int32(hdl, serdn),
1256 			    fmd_prop_get_int64(hdl, serdt));
1257 			fmd_hdl_debug(hdl,
1258 			    "\n%s: cpu_id %d: created a SERD engine %s\n",
1259 			    fltnm, cpu->cpu_cpuid,
1260 			    Lxcache->Lxcache_case.cc_serdnm);
1261 		}
1262 	}
1263 	fmd_hdl_debug(hdl,
1264 	    "\n%s:cpu_id %d: Checking if the SERD engine %s has fired.\n",
1265 	    fltnm, cpu->cpu_cpuid, Lxcache->Lxcache_case.cc_serdnm);
1266 
1267 	(void) fmd_serd_record(hdl, Lxcache->Lxcache_case.cc_serdnm, ep);
1268 	if (way >= 0) {
1269 		/*
1270 		 * Now that we have recorded the event ep we can do the
1271 		 * necessary cleanup of resources allocated for recheck of tags.
1272 		 */
1273 		find_and_destroy_anonymous_Lxcache(hdl, cpu, pstype, index);
1274 	}
1275 	if (fmd_serd_fired(hdl, Lxcache->Lxcache_case.cc_serdnm) ==
1276 	    FMD_B_FALSE)
1277 		return (CMD_EVD_OK);
1278 
1279 	fmd_hdl_debug(hdl, "\n%s: cpu_id = %d creating fault %s\n",
1280 	    fltnm, cpu->cpu_cpuid, Lxcache->Lxcache_case.cc_serdnm);
1281 	fmd_case_add_serd(hdl, Lxcache->Lxcache_case.cc_cp,
1282 	    Lxcache->Lxcache_case.cc_serdnm);
1283 	fmd_serd_reset(hdl, Lxcache->Lxcache_case.cc_serdnm);
1284 	if (way == -1) {
1285 		/*
1286 		 * The assignment below is to make the code easier to maintain.
1287 		 * We need to destroy the anonymous_Lxcache after we have
1288 		 * identifed a way to retire. If we cannot detrmine a way to
1289 		 * retire we will destrory the anonymous_Lxcache and fault the
1290 		 * cpu.
1291 		 */
1292 		anonymous_Lxcache = Lxcache;
1293 		/*
1294 		 * Anonymous TAG way retirement.
1295 		 * - if a way at this index has already been retired as
1296 		 *   "suspect-1", unretire that way, and retire the next
1297 		 *   unretired way as "suspect-0", using a pattern of all zeros
1298 		 *   for the PA bits.
1299 		 * - if a way at this index has already been retired as
1300 		 *   "suspect-0", re-retire that way as "suspect-1", using a
1301 		 *   pattern of all ones for the PA bits.
1302 		 * - if no ways have been retired as "suspect" for this index,
1303 		 *   retire the lowest unretired way as "suspect-0" for this
1304 		 *   bit, using a pattern of all zeros for the PA bits.
1305 		 * - if there is no next retirable way, fault the CPU.
1306 		 */
1307 		suspect_Lxcache = cmd_Lxcache_lookup_by_type_index_bit_reason(
1308 		    cpu, pstype, index, bit, CMD_LXSUSPECT_1_TAG);
1309 		anonymous_Lxcache->Lxcache_ep = ep;
1310 		if (suspect_Lxcache) {
1311 			ret = unretire_suspect_and_retire_next_retirable_way(
1312 			    hdl, cpu, suspect_Lxcache, anonymous_Lxcache,
1313 			    fltnm);
1314 			return (ret);
1315 		}	/* end SUSPECT_1_TAG */
1316 		suspect_Lxcache = cmd_Lxcache_lookup_by_type_index_bit_reason(
1317 		    cpu, pstype, index, bit, CMD_LXSUSPECT_0_TAG);
1318 		if (suspect_Lxcache) {
1319 			fmd_hdl_debug(hdl,
1320 			    "\n%s:cpu_id %d found index %d way %d"
1321 			    " bit %d retired as SUSPECT_0_TAG. Will"
1322 			    " re-retire this now as SUSPECT_1_TAG.\n",
1323 			    fltnm, cpu->cpu_cpuid, index,
1324 			    suspect_Lxcache->Lxcache_way, bit);
1325 			/*
1326 			 * destroy the anonymous_Lxcache
1327 			 */
1328 			cmd_Lxcache_destroy(hdl, cpu, anonymous_Lxcache);
1329 			suspect_Lxcache->Lxcache_ep = ep;
1330 			/*
1331 			 * We need to update the FM_FMRI_CPU_CACHE_BIT entry
1332 			 * in the Lxcache_asru_nvl. This entry was last updated
1333 			 * when the cacheline was retired as SUSPECT_0.
1334 			 * Therefore the MSB of FM_FMRI_CPU_CACHE_BIT entry
1335 			 * value will be reset. To retire cacheline as
1336 			 * SUSPECT_1 the MSB has to be set.
1337 			 */
1338 			errno = nvlist_add_uint16(
1339 			    suspect_Lxcache->Lxcache_asru_nvl,
1340 			    FM_FMRI_CPU_CACHE_BIT,
1341 			    suspect_Lxcache->Lxcache_bit);
1342 			if (errno) {
1343 				fmd_hdl_debug(hdl,
1344 				    "\n%s:cpu_id %d: failed to update",
1345 				    " CACHE_BIT in asru.\n",
1346 				    fltnm, cpu->cpu_cpuid);
1347 			}
1348 			return (cmd_Lxcache_retire_as_reason(hdl, cpu,
1349 			    suspect_Lxcache, fltnm, CMD_LXSUSPECT_1_TAG));
1350 		}	/* end of SUSPECT_0_TAG */
1351 		/*
1352 		 * No ways have been retired as "SUSPECT_x" for this bit.
1353 		 * We need to retire the lowest unretired way as suspect.
1354 		 */
1355 		ret = retire_lowest_retirable_way_as_suspect(hdl, cpu,
1356 		    anonymous_Lxcache,
1357 		    fltnm);
1358 		return (ret);
1359 	}	/* End of Anonymous TAG retirement */
1360 	/*
1361 	 * Identified bit and way has fired.
1362 	 * - Destroy any anonymous SERD engine at that index.
1363 	 * - If the bad bit is an ECC bit, fault the CPU.
1364 	 * - If the way was already convicted due to tag errors, fault the CPU.
1365 	 * - If the bad bit is a state bit, then:
1366 	 * - if the stable value of the bad bit will hold the NA encoding,
1367 	 *   retire the containing way as "convicted".
1368 	 * - if the stable value of the bad bit will not hold the NA
1369 	 *   encoding, fault the CPU.
1370 	 */
1371 	cmd_Lxcache_destroy_anonymous_serd_engines(hdl, cpu, pstype, index, -1);
1372 	sticky_bit = find_bit_stickiness(tag_data, way, bit);
1373 	if ((bit >= PN_LX_TAG_ECC_START_BIT) &&
1374 	    (bit <= PN_LX_TAG_ECC_END_BIT)) {
1375 		fmd_hdl_debug(hdl,
1376 		    "\n%s:cpu_id %d Bad ECC bit %d at cache index %d way %d"
1377 		    " detected. Will offline the CPU.\n",
1378 		    fltnm, cpu->cpu_cpuid, bit, index, way);
1379 		cmd_fault_the_cpu(hdl, cpu, pstype, fltnm);
1380 		return (CMD_EVD_OK);
1381 	}
1382 	/*
1383 	 * Check if a STATE bit is faulty.
1384 	 * If so we need to ensure that we will be able to
1385 	 * make the way NA, else fault the CPU.
1386 	 */
1387 	if (bit <= PN_LX_STATE_END_BIT) {
1388 		fmd_hdl_debug(hdl,
1389 		    "%s cpu_id = %d: STATE bit %d is faulty.\n",
1390 		    fltnm, cpu->cpu_cpuid, bit);
1391 		/*
1392 		 * If the stable value of bit will hold the NA encoding
1393 		 * retire the containing way Else fault the cpu.
1394 		 */
1395 		state = tag_data[way] & CH_ECSTATE_MASK;
1396 		if ((state & (1 << bit)) != (PN_ECSTATE_NA & (1 << bit))) {
1397 			/*
1398 			 * The stable value of the bad bit will not hold the
1399 			 * NA encoding. will fault the CPU.
1400 			 */
1401 			fmd_hdl_debug(hdl,
1402 			    "\n%s:cpu_id %d STATE bit %d is faulty at"
1403 			    " cache index %d way %d. STATE = 0x%x\n"
1404 			    " The bad bit will not hold the encoding we need"
1405 			    " to mark the cacheline as retired, so will offline"
1406 			    " the CPU.\n",
1407 			    fltnm, cpu->cpu_cpuid, bit, index, way, state);
1408 			cmd_fault_the_cpu(hdl, cpu, pstype, fltnm);
1409 			return (CMD_EVD_OK);
1410 		}
1411 	}
1412 	/*
1413 	 * Check if we are getting fault on a way that is already retired.
1414 	 * if the way was already convicted due to tag errors, fault the CPU.
1415 	 * Note that the way could have previously been retired due to
1416 	 * data errors.  This is okay; we just re-retire it due to tag errors,
1417 	 * so that we can write the offending tag bit to a stable value.
1418 	 */
1419 	if ((tag_data[way] & CH_ECSTATE_MASK) == PN_ECSTATE_NA) {
1420 		/*
1421 		 * Looking for CONVICTED TAG fault first.
1422 		 * If found retire the CPU.
1423 		 */
1424 		retired_Lxcache = cmd_Lxcache_lookup_by_type_index_way_reason(
1425 		    cpu, pstype, index, way, CMD_LXCONVICTED);
1426 		if (retired_Lxcache) {
1427 			fmd_hdl_debug(hdl,
1428 			    "\n%s: cpu %d: The cache index %d way %d previously"
1429 			    " retired for %s fault at bit %d is reporting"
1430 			    " fault. Will fault the CPU\n",
1431 			    fltnm, cpu->cpu_cpuid, index, way,
1432 			    cmd_type_to_str(
1433 			    retired_Lxcache->Lxcache_type),
1434 			    retired_Lxcache->Lxcache_bit);
1435 			cmd_fault_the_cpu(hdl, cpu, pstype, fltnm);
1436 			return (CMD_EVD_OK);
1437 		}
1438 		way_already_retired = 1;
1439 	}
1440 	/*
1441 	 * If any way(Including the current way) at this index is retired as
1442 	 * "suspect" due to tag errors, unretire it.  (If that suspect way
1443 	 * really was bad, it will start producing errors again and will
1444 	 * eventually be retired again.)
1445 	 */
1446 	suspect_Lxcache = cmd_Lxcache_lookup_by_type_index_bit_reason(
1447 	    cpu, pstype, index,  -1,
1448 	    (CMD_LXSUSPECT_0_TAG | CMD_LXSUSPECT_1_TAG));
1449 	if (suspect_Lxcache) {
1450 		fmd_hdl_debug(hdl,
1451 		    "\n%s:cpu_id %d found index %d way %d"
1452 		    " bit %d retired as SUSPECT_x. Will"
1453 		    "  unretire this now.\n",
1454 		    fltnm, cpu->cpu_cpuid, index,
1455 		    suspect_Lxcache->Lxcache_way, -1);
1456 		/*
1457 		 * unretire the suspect_x retired_way.
1458 		 */
1459 		if (cmd_Lxcache_unretire(hdl, cpu, suspect_Lxcache, fltnm)
1460 		    == B_TRUE) {
1461 			suspect_Lxcache->Lxcache_reason =
1462 			    CMD_LXFUNCTIONING;
1463 			fmd_hdl_debug(hdl,
1464 			    "\n%s:cpu_id %d index %d way %d"
1465 			    " successfully unretired. Will"
1466 			    " destroy this Lxcache now.\n",
1467 			    fltnm, cpu->cpu_cpuid, index,
1468 			    suspect_Lxcache->Lxcache_way);
1469 			cmd_Lxcache_destroy(hdl, cpu, suspect_Lxcache);
1470 		} else {
1471 			/*
1472 			 * We are unable to unretire the previously retired
1473 			 * SUSPECT way at the fault index.
1474 			 * If the previously retired way is same as the way
1475 			 * we are attempting to retire then return failure.
1476 			 */
1477 			if (suspect_Lxcache->Lxcache_way ==
1478 			    Lxcache->Lxcache_way)
1479 				return (CMD_EVD_BAD);
1480 		}
1481 	}
1482 	ways_retired = get_index_retired_ways(cpu, pstype, index);
1483 	if (ways_retired == -1)
1484 		return (CMD_EVD_BAD);
1485 	/*
1486 	 * Before retiring a way check if we have already
1487 	 * retired 3 ways for this index.
1488 	 * If the way was already retired due to DATA error or
1489 	 * SUSPECT_X TAG error then we skip the check.
1490 	 */
1491 	if (!way_already_retired) {
1492 		if (ways_retired >= 3) {
1493 			fmd_hdl_debug(hdl,
1494 			    "\n%s: cpu %d: num of ways retired for index %d"
1495 			    " is %d will fault the CPU\n",
1496 			    fltnm, cpu->cpu_cpuid, index, ways_retired);
1497 			cmd_fault_the_cpu(hdl, cpu, pstype, fltnm);
1498 			return (CMD_EVD_OK);
1499 		}
1500 	}
1501 	fmd_hdl_debug(hdl,
1502 	    "\n%s: cpu %d: num of ways retired for index %d is %d\n",
1503 	    fltnm, cpu->cpu_cpuid, index, ways_retired);
1504 	if ((errno = nvlist_add_uint16(Lxcache->Lxcache_asru_nvl,
1505 	    FM_FMRI_CPU_CACHE_BIT,
1506 	    sticky_bit)) != 0 ||
1507 	    (errno = fmd_nvl_fmri_expand(hdl, Lxcache->Lxcache_asru_nvl)) != 0)
1508 		fmd_hdl_abort(hdl, "failed to build Lxcache fmri");
1509 	Lxcache->Lxcache_ep = ep;
1510 	return (cmd_Lxcache_retire_as_reason(hdl, cpu, Lxcache, fltnm,
1511 	    CMD_LXCONVICTED));
1512 }
1513 
1514 static boolean_t
pn_there_is_a_matching_synd(fmd_hdl_t * hdl,cmd_xr_t * xr)1515 pn_there_is_a_matching_synd(fmd_hdl_t *hdl, cmd_xr_t *xr)
1516 {
1517 	int ec_data_idx, i;
1518 	int8_t	way;
1519 	uint64_t ec_tag, data_hi, data_lo;
1520 	int ecc, calc_synd;
1521 	ec_data_elm_t *ecdptr = NULL;
1522 	uint8_t state;
1523 	ch_ec_data_t	*ecp;
1524 
1525 	ecp = (ch_ec_data_t *)(xr->xr_cache_data);
1526 	for (way = 0; way < xr->xr_num_ways; way++, ecp++) {
1527 		ec_tag = ecp->ec_tag;
1528 		/*
1529 		 * skip Retired and Invalid ways
1530 		 */
1531 		state = ec_tag & CH_ECSTATE_MASK;
1532 		if ((state == PN_ECSTATE_NA) ||
1533 		    (state == CH_ECSTATE_INV))
1534 			continue;
1535 		/*
1536 		 * Each 16 bytes of data are protected by 9-bit ECC field.
1537 		 */
1538 
1539 		for (i = 0; i < (CH_ECACHE_SUBBLK_SIZE/16); i++) {
1540 			ec_data_idx = (i/2);
1541 
1542 			ecdptr = &ecp->ec_data[ec_data_idx];
1543 			if ((i & 1) == 0) {
1544 				ecc = (ecdptr->ec_eccd >> 9) & 0x1ff;
1545 				data_hi = ecdptr->ec_d8[0];
1546 				data_lo = ecdptr->ec_d8[1];
1547 			} else {
1548 				ecc = ecdptr->ec_eccd & 0x1ff;
1549 				data_hi = ecdptr->ec_d8[2];
1550 				data_lo = ecdptr->ec_d8[3];
1551 			}
1552 
1553 			calc_synd = calcsynd(data_hi, data_lo, ecc);
1554 			if ((calc_synd != 0) &&
1555 			    (xr->xr_synd == calc_synd)) {
1556 				if (xr->xr_num_ways == 1) {
1557 					fmd_hdl_debug(hdl,
1558 			"\ncomputed syndrome matches with the reported syndrome"
1559 			" 0x%x index = %d way = %d\n",
1560 					    xr->xr_synd, xr->xr_error_index,
1561 					    xr->xr_error_way);
1562 				} else {
1563 					fmd_hdl_debug(hdl,
1564 					    "\ncomputed syndrome matches with"
1565 					    " the reported syndrome"
1566 					    " 0x%x index = %d way = %d\n",
1567 					    xr->xr_synd, xr->xr_error_index,
1568 					    way);
1569 					xr->xr_error_way = way;
1570 				}
1571 				return (B_TRUE);
1572 			}
1573 		}
1574 	}
1575 	return (B_FALSE);
1576 }
1577 
1578 /* add to cheetahregs.h */
1579 #define	CH_ECSTATE_NA 	5
1580 
1581 static int32_t
pn_extract_index(int32_t type,uint64_t afar)1582 pn_extract_index(int32_t type, uint64_t afar)
1583 {
1584 	int32_t index = -1;
1585 
1586 	switch (type) {
1587 		case CMD_PTR_CPU_L2DATA:
1588 			index = (int32_t)((afar & PN_L2_INDEX_MASK)
1589 			    >> PN_CACHE_LINE_SHIFT);
1590 			break;
1591 		case CMD_PTR_CPU_L3DATA:
1592 			index = (int32_t)((afar & PN_L3_INDEX_MASK)
1593 			    >> PN_CACHE_LINE_SHIFT);
1594 			break;
1595 	}
1596 	return (index);
1597 }
1598 
1599 /*
1600  *	cmd_cache_ce_panther
1601  *
1602  *	This routine handles L2 and L3 cachedata errors for the Panther.
1603  *	It's called when the train processing for L2 and L3 correctable
1604  *	data errors are about to issue a fault.
1605  *
1606  *	This routine retrieves payload information gathered during the XR
1607  *	processing and generates a unique SERD engine and cache data
1608  *	associated with the CPU if one does not exist.
1609  *	If the SERD fires for the given engine it will initiate a cache
1610  *	line fault if the way is not anonomyous.
1611  *	If the way is anonomyous, it will attempt to choose a way for the
1612  *	given index to fault. If the maximum for the index has not been
1613  *	reached, it will attempt to unretire a different way previously retired
1614  * 	under suspicion for the index prior to faulting
1615  *	the selected way.
1616  *	The routine will also fault the CPU if the maximum number of
1617  *	retired ways for the CPU has been exceeded based on the category.
1618  */
1619 /*ARGSUSED*/
1620 int
cmd_cache_ce_panther(fmd_hdl_t * hdl,fmd_event_t * ep,cmd_xr_t * xr)1621 cmd_cache_ce_panther(fmd_hdl_t *hdl, fmd_event_t *ep, cmd_xr_t *xr)
1622 {
1623 	cmd_Lxcache_t *suspect_Lxcache, *Lxcache, *anonymous_Lxcache;
1624 	cmd_cpu_t *cpu = xr->xr_cpu;
1625 	cmd_case_t *cpu_cc;
1626 	cmd_ptrsubtype_t type;
1627 	const errdata_t *cache_ed;
1628 	uint16_t offset;
1629 	int16_t bit;
1630 	int	ways_retired;
1631 	int	ret;
1632 
1633 	/*
1634 	 * The caller of this routine cmd_xxc_hdlr() expects us to
1635 	 * return CMD_EVD_OK for success and CMD_EVD_BAD for failures.
1636 	 * If this is not a Panther or one of the Panther specific
1637 	 * errors that we handle here, then exit
1638 	 */
1639 
1640 	if (cpu->cpu_pers.cpup_type != CPU_ULTRASPARC_IVplus)
1641 		return (CMD_EVD_BAD);
1642 
1643 	if (!(xr->xr_clcode & (int)PN_CACHE_ERRORS))
1644 		return (CMD_EVD_BAD);
1645 
1646 
1647 	/* Set up Cache specific structs */
1648 
1649 	if (CMD_ERRCL_ISL2XXCU(xr->xr_clcode)) {
1650 		type = CMD_PTR_CPU_L2DATA;
1651 		cpu_cc = &cpu->cpu_l2data;
1652 		cache_ed = &l2errdata;
1653 	} else {
1654 		type = CMD_PTR_CPU_L3DATA;
1655 		cpu_cc = &cpu->cpu_l3data;
1656 		cache_ed = &l3errdata;
1657 	}
1658 
1659 	/* Ensure that our case is not solved */
1660 
1661 	if (cpu->cpu_faulting || (cpu_cc->cc_cp != NULL &&
1662 	    fmd_case_solved(hdl, cpu_cc->cc_cp)))
1663 			return (CMD_EVD_OK);
1664 
1665 	fmd_hdl_debug(hdl, "Processing Panther %s Error\n",
1666 	    cache_ed->ed_fltnm);
1667 
1668 	/* L3 errors arrive as mem scheme errors - convert to CPU */
1669 	if (type == CMD_PTR_CPU_L3DATA) {
1670 		cmd_fmri_init(hdl, &xr->xr_rsrc,
1671 		    xr->xr_detector_nvlist, "%s_rsrc",
1672 		    fmd_case_uuid(hdl, xr->xr_case));
1673 	}
1674 	bit = (uint8_t)ecc_syndrome_tab[xr->xr_synd];
1675 	offset = (uint16_t)xr->xr_afar & 0x3f;
1676 	if (bit > C8) {
1677 		fmd_hdl_debug(hdl, "xxC/LDxC dropped due to syndrome\n");
1678 		return (CMD_EVD_BAD);
1679 	}
1680 	if (bit < C0) {
1681 		/*
1682 		 * Data bit. Set bit in the range 0-511
1683 		 */
1684 		bit += ((3 - (offset/16)) * 128);
1685 	} else {
1686 		/*
1687 		 * ECC bit. Set bit in the range 512-547
1688 		 */
1689 		bit -= C0;
1690 		bit += 512 + ((3 - (offset/16)) * PN_LX_NUM_OF_BITS_IN_ECC);
1691 	}
1692 	xr->xr_error_index = pn_extract_index(type, xr->xr_afar);
1693 	if (xr->xr_error_index == 0xffffffff) {
1694 		fmd_hdl_debug(hdl, "xxC/LDxC dropped due to index\n");
1695 		return (CMD_EVD_BAD);
1696 	}
1697 	fmd_hdl_debug(hdl, "cpu_id: %d, syndrome: 0x%x, afar: 0x%llx\n",
1698 	    xr->xr_cpuid, xr->xr_synd, xr->xr_afar);
1699 	fmd_hdl_debug(hdl, "index: 0x%x(%d) bit: %d\n",
1700 	    xr->xr_error_index, xr->xr_error_index, bit);
1701 	/*
1702 	 * The payload information for the DATA errors are assembled
1703 	 * after first looking for a valid line that matches the fault AFAR.
1704 	 * If no match is found all 4 ways are logged and xr_num_ways
1705 	 * will be 4. If a matching way is found only that entry is logged
1706 	 * and xr_num_ways is set as 1.
1707 	 * The xr_error_way is set as -1 when xr_num_ways is 4, else
1708 	 * xr_error_way is set to the matching way.
1709 	 * what we do below is to force the xr_error_way to -1 for WDC/CPC
1710 	 * errors.
1711 	 * For UCC and EDC errors the xr_error_way will be set correctly.
1712 	 */
1713 
1714 	switch (xr->xr_clcode) {
1715 		case CMD_ERRCL_WDC:
1716 		case CMD_ERRCL_L3_WDC:
1717 			/*
1718 			 * WDC is a disrupting trap, and invalidates and
1719 			 * overwrites the problematic way.  Any match is due to
1720 			 * a refetch of the AFAR, which could have been to any
1721 			 * way. So these are treated as "anonymous".
1722 			 */
1723 			fmd_hdl_debug(hdl, "WDC fault detected\n");
1724 			xr->xr_error_way = (uint32_t)CMD_ANON_WAY;
1725 			break;
1726 		case CMD_ERRCL_CPC:
1727 		case CMD_ERRCL_L3_CPC:
1728 			/*
1729 			 * CPC is a disrupting trap, but since it happens due to
1730 			 * a snoop, the problematic way could become invalid,
1731 			 * overwritten by a different cache line, and then the
1732 			 * AFAR accessed and pulled into a different way,
1733 			 * causing a false positive match.  So it's best to not
1734 			 * look for a matching way and just ascribe these to
1735 			 *  the "anonymous" way.
1736 			 */
1737 			fmd_hdl_debug(hdl, "CPC fault detected\n");
1738 			xr->xr_error_way = (uint32_t)CMD_ANON_WAY;
1739 			break;
1740 		case CMD_ERRCL_UCC:
1741 		case CMD_ERRCL_L3_UCC:
1742 			/*
1743 			 * UCC is a precise trap, so, absent activity from the
1744 			 * other core, the tag address values read by the TL=1
1745 			 * trap handler are likely to be the same as those at
1746 			 * the time of the trap.
1747 			 * (A snoop from another CPU might cause a change in
1748 			 * state from valid to invalid, but the  tag address
1749 			 * won't change.) If we find a matching valid tag,
1750 			 * that identifies the way.
1751 			 */
1752 			fmd_hdl_debug(hdl, "UCC fault detected\n");
1753 			fmd_hdl_debug(hdl, "# of ways collected are %d\n",
1754 			    xr->xr_num_ways);
1755 			fmd_hdl_debug(hdl,
1756 			    "\n%s:cpu_id %d: error way = %d\n",
1757 			    cache_ed->ed_fltnm, cpu->cpu_cpuid,
1758 			    xr->xr_error_way);
1759 			break;
1760 		case CMD_ERRCL_EDC:
1761 		case CMD_ERRCL_L3_EDC:
1762 			/*
1763 			 * EDC is a disrupting trap, but again if a matching
1764 			 * valid way is found, it is likely to be the correct
1765 			 * way.
1766 			 */
1767 			fmd_hdl_debug(hdl, "EDC fault detected\n");
1768 			fmd_hdl_debug(hdl, "# of ways collected are %d\n",
1769 			    xr->xr_num_ways);
1770 			fmd_hdl_debug(hdl,
1771 			    "\n%s:cpu_id %d: error way = %d\n",
1772 			    cache_ed->ed_fltnm, cpu->cpu_cpuid,
1773 			    xr->xr_error_way);
1774 			break;
1775 		default:
1776 			fmd_hdl_debug(hdl, "Unexpected fault detected\n");
1777 			xr->xr_error_way = (uint32_t)CMD_ANON_WAY;
1778 	}
1779 	if ((type == CMD_PTR_CPU_L2DATA) &&
1780 	    (xr->xr_cache_data != NULL) &&
1781 	    (!pn_there_is_a_matching_synd(hdl, xr))) {
1782 		fmd_hdl_debug(hdl, "No matching syndrome\n");
1783 	}
1784 	Lxcache = cmd_Lxcache_lookup_by_type_index_way_bit(xr->xr_cpu, type,
1785 	    xr->xr_error_index, xr->xr_error_way, bit);
1786 
1787 	if (Lxcache == NULL) {
1788 		fmd_hdl_debug(hdl,
1789 		    "\n%s: cpu %d: creating a case for index %d way %d"
1790 		    " bit %d\n",
1791 		    cache_ed->ed_fltnm, xr->xr_cpuid,
1792 		    xr->xr_error_index, xr->xr_error_way, bit);
1793 		Lxcache = cmd_Lxcache_create(hdl, xr, xr->xr_cpu,
1794 		    xr->xr_cpu->cpu_asru_nvl,
1795 		    type, xr->xr_error_index,
1796 		    xr->xr_error_way, bit);
1797 		if (Lxcache == NULL) {
1798 			fmd_hdl_debug(hdl,
1799 			    "\n%s:cpu_id %d:Failed to create a Lxcache for"
1800 			    " index %d way %d bit %d\n",
1801 			    cache_ed->ed_fltnm, cpu->cpu_cpuid,
1802 			    Lxcache->Lxcache_index,
1803 			    Lxcache->Lxcache_way, Lxcache->Lxcache_bit);
1804 			return (CMD_EVD_BAD);
1805 		}
1806 	}
1807 	if (cmd_create_case_for_Lxcache(hdl, cpu, Lxcache) == B_FALSE)
1808 		return (CMD_EVD_BAD);
1809 	if (Lxcache->Lxcache_case.cc_serdnm == NULL) {
1810 		Lxcache->Lxcache_case.cc_serdnm =
1811 		    cmd_Lxcache_serdnm_create(hdl, xr->xr_cpuid,
1812 		    type, xr->xr_error_index, xr->xr_error_way, bit);
1813 
1814 		if (!fmd_serd_exists(hdl,
1815 		    Lxcache->Lxcache_case.cc_serdnm)) {
1816 			fmd_serd_create(hdl,
1817 			    Lxcache->Lxcache_case.cc_serdnm,
1818 			    cache_ed->ed_serd->cs_n,
1819 			    cache_ed->ed_serd->cs_t);
1820 			fmd_hdl_debug(hdl,
1821 			    "\n%s: cpu_id %d: created a SERD engine %s\n",
1822 			    cache_ed->ed_fltnm, cpu->cpu_cpuid,
1823 			    Lxcache->Lxcache_case.cc_serdnm);
1824 		}
1825 	}
1826 	/* Ensure that our case is not solved */
1827 	if ((Lxcache->Lxcache_case.cc_cp != NULL) &&
1828 	    fmd_case_solved(hdl, Lxcache->Lxcache_case.cc_cp)) {
1829 		fmd_hdl_debug(hdl,
1830 		    "\n%s:cpu %d: the case for %s is already solved.\n",
1831 		    cache_ed->ed_fltnm, cpu->cpu_cpuid,
1832 		    Lxcache->Lxcache_bufname);
1833 		return (CMD_EVD_REDUND);
1834 	}
1835 
1836 	fmd_hdl_debug(hdl,
1837 	    "\n%s:cpu_id %d: checking if SERD engine %s has fired.\n",
1838 	    cache_ed->ed_fltnm, xr->xr_cpuid, Lxcache->Lxcache_case.cc_serdnm);
1839 
1840 	if (fmd_serd_record(hdl, Lxcache->Lxcache_case.cc_serdnm, ep)
1841 	    == FMD_B_FALSE)
1842 		return (CMD_EVD_OK); /* serd engine hasn't fired yet */
1843 
1844 	fmd_hdl_debug(hdl, "\n%s: cpu_id = %d creating fault %s\n",
1845 	    cache_ed->ed_fltnm, cpu->cpu_cpuid,
1846 	    Lxcache->Lxcache_case.cc_serdnm);
1847 	fmd_case_add_serd(hdl, Lxcache->Lxcache_case.cc_cp,
1848 	    Lxcache->Lxcache_case.cc_serdnm);
1849 	fmd_serd_reset(hdl, Lxcache->Lxcache_case.cc_serdnm);
1850 	/*
1851 	 * Find out if there is a way at the fault index/bit that was retired
1852 	 * as suspect. We need this information for both anonymous way and
1853 	 * identified way handling. We store this info in suspect_Lxcache.
1854 	 */
1855 	fmd_hdl_debug(hdl,
1856 	    "\n%s:cpu_id %d checking if there is a way at"
1857 	    " index %d retired as suspect due to bit %d\n",
1858 	    cache_ed->ed_fltnm, cpu->cpu_cpuid,
1859 	    Lxcache->Lxcache_index, Lxcache->Lxcache_bit);
1860 	suspect_Lxcache = cmd_Lxcache_lookup_by_type_index_bit_reason(
1861 	    cpu, type, Lxcache->Lxcache_index, Lxcache->Lxcache_bit,
1862 	    CMD_LXSUSPECT_DATA);
1863 	if (xr->xr_error_way != (uint32_t)CMD_ANON_WAY) {
1864 		/*
1865 		 * IDENTIFIED WAY DATA error handling.
1866 		 *
1867 		 * If there is a way at that index retired as suspect due
1868 		 * to that bit, unretire it.
1869 		 * retire the identified way, and mark the way as "convicted"
1870 		 * for this bit. Destroy any anonymous SERD engine named by
1871 		 * that index and bit.
1872 		 */
1873 		if (suspect_Lxcache != NULL) {
1874 			fmd_hdl_debug(hdl,
1875 			    "\n%s:cpu_id %d found index %d way %d"
1876 			    " bit %d retired on suspicion. Will"
1877 			    "  unretire this now.\n",
1878 			    cache_ed->ed_fltnm, cpu->cpu_cpuid,
1879 			    suspect_Lxcache->Lxcache_index,
1880 			    suspect_Lxcache->Lxcache_way,
1881 			    suspect_Lxcache->Lxcache_bit);
1882 			/*
1883 			 * unretire the retired_way.
1884 			 */
1885 			if (cmd_Lxcache_unretire(hdl, cpu, suspect_Lxcache,
1886 			    cache_ed->ed_fltnm) == B_TRUE) {
1887 				suspect_Lxcache->Lxcache_reason =
1888 				    CMD_LXFUNCTIONING;
1889 				cmd_Lxcache_destroy(hdl, cpu, suspect_Lxcache);
1890 			}
1891 			/*
1892 			 * We proceed to retire the identified way even if
1893 			 * we are unable to unretire the suspect way.
1894 			 * We will not end up retiring all 4 ways because
1895 			 * we check the actual number of ways retired
1896 			 * at this index by reading the info from processor
1897 			 * directly. The call to get_index_retired_ways() does
1898 			 * that.
1899 			 */
1900 		}
1901 		/*
1902 		 * Before retiring a way check if we have already
1903 		 * retired 3 ways for this index.
1904 		 */
1905 		ways_retired = get_index_retired_ways(cpu, type,
1906 		    Lxcache->Lxcache_index);
1907 		if (ways_retired == -1) {
1908 			fmd_hdl_debug(hdl,
1909 			    "\n%s: cpu %d: We are unable to determine how many"
1910 			    " ways are retired at this index. We will not be"
1911 			    " retiring the identified cacheline at index %d"
1912 			    " way %d\n",
1913 			    cache_ed->ed_fltnm, cpu->cpu_cpuid,
1914 			    Lxcache->Lxcache_index, Lxcache->Lxcache_way);
1915 			return (CMD_EVD_BAD);
1916 		}
1917 		if (ways_retired >= 3) {
1918 			fmd_hdl_debug(hdl,
1919 			    "\n%s: cpu %d: num of ways retired for index %d"
1920 			    " is %d. Will fault the CPU\n",
1921 			    cache_ed->ed_fltnm, cpu->cpu_cpuid,
1922 			    Lxcache->Lxcache_index, ways_retired);
1923 			cmd_fault_the_cpu(hdl, cpu, type, cache_ed->ed_fltnm);
1924 			return (CMD_EVD_OK);
1925 		}
1926 		/*
1927 		 * retire the cache line
1928 		 */
1929 		ret = cmd_Lxcache_retire_as_reason(hdl, cpu, Lxcache,
1930 		    cache_ed->ed_fltnm, CMD_LXCONVICTED);
1931 		if (ret != CMD_EVD_OK)
1932 			return (ret);
1933 		/*
1934 		 * anonymous serd engines for DATA faults will have valid bit
1935 		 * but way as -1.
1936 		 */
1937 		cmd_Lxcache_destroy_anonymous_serd_engines(hdl, cpu, type,
1938 		    Lxcache->Lxcache_index,
1939 		    bit);
1940 		return (CMD_EVD_OK);
1941 	}	/* end of IDENTIFIED WAY error handling */
1942 	/*
1943 	 * ANONYMOUS WAY DATA error handling.
1944 	 *
1945 	 * - if a way at this index has already been retired as "suspect"
1946 	 * for this bit, unretire that way, and retire the next retirable
1947 	 * way as "suspect" for this bit.
1948 	 * - if no ways have been retired as "suspect" for this bit,
1949 	 * retire the lowest unretired way as "suspect" for this bit.
1950 	 * - if there is no next retirable way, fault the CPU.
1951 	 */
	/*
	 * The assignment below is to make the code easier to maintain.
	 * We need to destroy the anonymous_Lxcache after we have
	 * identified a way to retire. If we cannot determine a way to
	 * retire we will destroy the anonymous_Lxcache and fault the cpu.
	 */
1958 	anonymous_Lxcache = Lxcache;
1959 	anonymous_Lxcache->Lxcache_ep = ep;
1960 	if (suspect_Lxcache != NULL) {
1961 		ret = unretire_suspect_and_retire_next_retirable_way(hdl,
1962 		    cpu, suspect_Lxcache, anonymous_Lxcache,
1963 		    cache_ed->ed_fltnm);
1964 	} else {
1965 		ret = retire_lowest_retirable_way_as_suspect(hdl, cpu,
1966 		    anonymous_Lxcache, cache_ed->ed_fltnm);
1967 	}
1968 	return (ret);
1969 }
1970 
1971 /* ARGSUSED */
1972 int
cmd_xr_pn_cache_fill(fmd_hdl_t * hdl,nvlist_t * nvl,cmd_xr_t * xr,cmd_cpu_t * cpu,cmd_errcl_t clcode)1973 cmd_xr_pn_cache_fill(fmd_hdl_t *hdl, nvlist_t *nvl, cmd_xr_t *xr,
1974     cmd_cpu_t *cpu, cmd_errcl_t clcode)
1975 {
1976 	struct ch_ec_data *data_ptr;
1977 	uint64_t *cache_data = NULL;
1978 	uint_t sz;
1979 
1980 	if (cpu->cpu_pers.cpup_type != CPU_ULTRASPARC_IVplus)
1981 		return (0);
1982 
1983 	if (nvlist_lookup_nvlist(nvl, FM_EREPORT_DETECTOR,
1984 	    &xr->xr_detector_nvlist) != 0) {
1985 		fmd_hdl_debug(hdl, "look up for FM_EREPORT_DETECTOR failed\n");
1986 		return (-1);
1987 	}
1988 	if (nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_NAME_AFSR,
1989 	    &xr->xr_afsr) != 0) {
1990 		fmd_hdl_debug(hdl,
1991 		    "look up for FM_EREPORT_PAYLOAD_NAME_AFSR failed\n");
1992 		return (-1);
1993 	}
1994 
1995 	/* check clcode for l2/l3 first */
1996 	if (CMD_ERRCL_ISL3XXCU(clcode)) {
1997 		if (nvlist_lookup_uint8(nvl, FM_EREPORT_PAYLOAD_NAME_L3_WAYS,
1998 		    &xr->xr_num_ways) != 0) {
1999 			fmd_hdl_debug(hdl,
2000 		    "look up for FM_EREPORT_PAYLOAD_NAME_L3_WAYS failed\n");
2001 			return (-1);
2002 		}
2003 
2004 		if (nvlist_lookup_uint64_array(nvl,
2005 		    FM_EREPORT_PAYLOAD_NAME_L3_DATA, (uint64_t **)&cache_data,
2006 		    &sz) != 0) {
2007 			fmd_hdl_debug(hdl,
2008 		    "look up for FM_EREPORT_PAYLOAD_NAME_L3_DATA failed\n");
2009 		}
2010 	} else {
2011 		if (nvlist_lookup_uint8(nvl, FM_EREPORT_PAYLOAD_NAME_L2_WAYS,
2012 		    &xr->xr_num_ways) != 0) {
2013 			fmd_hdl_debug(hdl,
2014 		    "look up for FM_EREPORT_PAYLOAD_NAME_L2_WAYS failed\n");
2015 			return (-1);
2016 		}
2017 
2018 		if (nvlist_lookup_uint64_array(nvl,
2019 		    FM_EREPORT_PAYLOAD_NAME_L2_DATA, (uint64_t **)&cache_data,
2020 		    &sz) != 0) {
2021 			fmd_hdl_debug(hdl,
2022 		    "look up for FM_EREPORT_PAYLOAD_NAME_L2_DATA failed\n");
2023 		}
2024 	}
2025 	if (xr->xr_num_ways > PN_CACHE_NWAYS) {
2026 		fmd_hdl_debug(hdl,
2027 		    "xr_num_ways > PN_CACHE_WAYS\n");
2028 		return (-1);
2029 	}
2030 
2031 	xr->xr_cache_data = cache_data;
2032 	data_ptr = (struct ch_ec_data *)cache_data;
2033 	if (cache_data == NULL) {
2034 		xr->xr_error_way = (uint32_t)CMD_ANON_WAY;
2035 		return (0);
2036 	}
2037 
2038 	/*
2039 	 * Our error handler checks for a matching valid way
2040 	 * If there is a match, there is only 1 data set, the set
2041 	 * associated with the cache-line/way that was "valid"
2042 	 * Otherwise, it stores all of the ways
2043 	 */
2044 	xr->xr_error_tag = data_ptr[0].ec_tag;
2045 	xr->xr_error_way = (uint32_t)data_ptr[0].ec_way;
2046 
2047 	/* If there is more than 1 way structure, set way to Anonymous */
2048 	if (xr->xr_num_ways > 1)
2049 		xr->xr_error_way = (uint32_t)CMD_ANON_WAY;
2050 
2051 	return (0);
2052 }
2053