1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
24 */
25
26
27 /*
28 * Support routines for managing per-Lxcache state.
29 */
30
31 #include <sys/types.h>
32 #include <errno.h>
33 #include <strings.h>
34 #include <sys/stat.h>
35 #include <fcntl.h>
36 #include <unistd.h>
37 #include <stropts.h>
38 #include <fm/fmd_api.h>
39 #include <sys/fm/protocol.h>
40 #include <sys/fm/cpu/UltraSPARC-III.h>
41 #include <sys/cpuvar.h>
42 #include <cmd_Lxcache.h>
43 #include <cmd_mem.h>
44 #include <cmd_cpu.h>
45 #include <cmd_state.h>
46 #include <cmd.h>
47 #define _KERNEL
48 #include <sys/cheetahregs.h>
49 #include <sys/mem_cache.h>
50 #undef _KERNEL
51 #include <sys/errclassify.h>
52 #include <sys/fm/io/sun4upci.h>
53
54 #include <fmd_adm.h>
55 #include <fmd_adm_impl.h>
56 #include <fmd_rpc_adm.h>
57
58 #define PN_CACHE_ERRORS (CMD_ERRCL_UCC | CMD_ERRCL_WDC | \
59 CMD_ERRCL_CPC | CMD_ERRCL_EDC | \
60 CMD_ERRCL_L3_UCC | CMD_ERRCL_L3_CPC |\
61 CMD_ERRCL_L3_WDC | CMD_ERRCL_L3_EDC)
62
63 /* Note that these are the same for panther L2 and L3 (see prm) */
64
65 #define LX_INDEX_MASK PN_L2_INDEX_MASK
66 #define LX_INDEX_SHIFT 6
67 #define PN_ECSTATE_NA 5
68 #define PN_ECSTATE_INV 0
69
70 #define PN_L3_INDEX_MASK PN_L3_TAG_RD_MASK
71
/*
 * SERD engine and fault-name bindings used when opening Lxcache cases
 * for L3 and L2 cache-data errors.
 */
static const errdata_t l3errdata =
	{ &cmd.cmd_l3data_serd, "l3cachedata", CMD_PTR_LxCACHE_CASE };
static const errdata_t l2errdata =
	{ &cmd.cmd_l2data_serd, "l2cachedata", CMD_PTR_LxCACHE_CASE };
76
77 /* Macro for putting 64-bit onto stack as two 32-bit ints */
78 #define PRTF_64_TO_32(x) (uint32_t)((x)>>32), (uint32_t)(x)
79
80 #define LX_PA_MASK2_32BIT_CORRECT 16
81 #define LX_PA_MASK3_32BIT_CORRECT 24
82 #define LX_PA_MASK2 0x7fffff8
83 #define LX_PA_MASK3 0x7ffff8
84
85
86 #define MAX_RETRIES_FOR_ECC_MATCH 3
87 #define PN_TAG_ECC_MASK 0x7fc0
88 #define PN_L2_PTAG_SHIFT 19
89 #define PN_L3_PTAG_SHIFT 24
90 #define L2_PTAG_MASK 0xffffff
91 #define L3_PTAG_MASK 0xfffff
92 #define BIT_MASK 0x7f
93 #define MSB_BIT 0x8000
94 #define SET_MSB_BIT 0x8000
95 #define CLEAR_MSB_BIT 0x7fff
96 #define PN_LX_TAG_ECC_START_BIT 6
97 #define PN_LX_TAG_ECC_END_BIT 14
98 #define PN_LX_STATE_END_BIT 2
99 #define PN_LX_NUM_OF_BITS_IN_ECC 9
100
101 #define LX_NWAYS 4
102
int test_mode = 0;	/* should be 0 in production version. */
#define	FM_EREPORT_RECHECK_OF_TAGS	"recheck_tags"
#define	RETRIES_TO_BE_DONE_WHEN_SYND_IS_ZERO	3
/*
 * Delay applied before each re-read of the tags when the computed
 * syndrome is zero (one entry per retry, index 0 = first attempt).
 * NOTE(review): units are presumably seconds passed to an fmd timer —
 * confirm against the caller that installs the timeout.
 */
uint32_t cmd_Lxcache_recheck_tags_delay
	[RETRIES_TO_BE_DONE_WHEN_SYND_IS_ZERO + 1] = {0, 1, 2, 4};
108
109 /*
110 * e (for ecctable) maps single bit positions (0-127, or 0-0x7F) to the
111 * corresponding ECC syndromes for an error in that position.
112 */
/* Entries are 9-bit syndromes; last row is the check bits C0..C8. */
int e[] = {
	/* From Table P-4, JPS1 US-III Supplement */
	/* 0	1	2	3	4	5	6	7 */
/* 00 */ 0x03B, 0x127, 0x067, 0x097, 0x10F, 0x08F, 0x04F, 0x02C,
/* 08 */ 0x147, 0x0C7, 0x02F, 0x01C, 0x117, 0x032, 0x08A, 0x04A,
/* 10 */ 0x01F, 0x086, 0x046, 0x026, 0x09B, 0x08C, 0x0C1, 0x0A1,
/* 18 */ 0x01A, 0x016, 0x061, 0x091, 0x052, 0x00E, 0x109, 0x029,
/* 20 */ 0x02A, 0x019, 0x105, 0x085, 0x045, 0x025, 0x015, 0x103,
/* 28 */ 0x031, 0x00D, 0x083, 0x043, 0x051, 0x089, 0x023, 0x007,
/* 30 */ 0x0B9, 0x049, 0x013, 0x0A7, 0x057, 0x00B, 0x07A, 0x187,
/* 38 */ 0x0F8, 0x11B, 0x079, 0x034, 0x178, 0x1D8, 0x05B, 0x04C,
/* 40 */ 0x064, 0x1B4, 0x037, 0x03D, 0x058, 0x13C, 0x1B1, 0x03E,
/* 48 */ 0x1C3, 0x0BC, 0x1A0, 0x1D4, 0x1CA, 0x190, 0x124, 0x13A,
/* 50 */ 0x1C0, 0x188, 0x122, 0x114, 0x184, 0x182, 0x160, 0x118,
/* 58 */ 0x181, 0x150, 0x148, 0x144, 0x142, 0x141, 0x130, 0x0A8,
/* 60 */ 0x128, 0x121, 0x0E0, 0x094, 0x112, 0x10C, 0x0D0, 0x0B0,
/* 68 */ 0x10A, 0x106, 0x062, 0x1B2, 0x0C8, 0x0C4, 0x0C2, 0x1F0,
/* 70 */ 0x0A4, 0x0A2, 0x098, 0x1D1, 0x070, 0x1E8, 0x1C6, 0x1C5,
/* 78 */ 0x068, 0x1E4, 0x1E2, 0x1E1, 0x1D2, 0x1CC, 0x1C9, 0x1B8,
	/* Now we have the check bits */
	/* C0	C1	C2	C3	C4	C5	C6	C7	C8 */
	0x001, 0x002, 0x004, 0x008, 0x010, 0x020, 0x040, 0x080, 0x100,
};
136
137 #define NBITS (sizeof (e)/sizeof (e[0]))
138 #define NDATABITS (128)
139 /*
140 * This table is used to determine which bit(s) is(are) bad when an ECC
141 * error occurs. The array is indexed by an 9-bit syndrome. The entries
142 * of this array have the following semantics:
143 *
144 * 00-127 The number of the bad bit, when only one bit is bad.
145 * 128 ECC bit C0 is bad.
146 * 129 ECC bit C1 is bad.
147 * 130 ECC bit C2 is bad.
148 * 131 ECC bit C3 is bad.
149 * 132 ECC bit C4 is bad.
150 * 133 ECC bit C5 is bad.
151 * 134 ECC bit C6 is bad.
152 * 135 ECC bit C7 is bad.
153 * 136 ECC bit C8 is bad.
154 * 137-143 reserved for Mtag Data and ECC.
155 * 144(M2) Two bits are bad within a nibble.
156 * 145(M3) Three bits are bad within a nibble.
157 * 146(M3) Four bits are bad within a nibble.
158 * 147(M) Multiple bits (5 or more) are bad.
159 * 148 NO bits are bad.
160 * Based on "Cheetah Programmer's Reference Manual" rev 1.1, Tables 11-4,11-5.
161 */
162
163 #define C0 128
164 #define C1 129
165 #define C2 130
166 #define C3 131
167 #define C4 132
168 #define C5 133
169 #define C6 134
170 #define C7 135
171 #define C8 136
172 #define MT0 137 /* Mtag Data bit 0 */
173 #define MT1 138
174 #define MT2 139
175 #define MTC0 140 /* Mtag Check bit 0 */
176 #define MTC1 141
177 #define MTC2 142
178 #define MTC3 143
179 #define M2 144
180 #define M3 145
181 #define M4 146
182 #define M 147
183 #define NA 148
184 #if defined(JALAPENO) || defined(SERRANO)
185 #define S003 149 /* Syndrome 0x003 => likely from CPU/EDU:ST/FRU/BP */
186 #define S003MEM 150 /* Syndrome 0x003 => likely from WDU/WBP */
187 #define SLAST S003MEM /* last special syndrome */
188 #else /* JALAPENO || SERRANO */
189 #define S003 149 /* Syndrome 0x003 => likely from EDU:ST */
190 #define S071 150 /* Syndrome 0x071 => likely from WDU/CPU */
191 #define S11C 151 /* Syndrome 0x11c => likely from BERR/DBERR */
192 #define SLAST S11C /* last special syndrome */
193 #endif /* JALAPENO || SERRANO */
194 #if defined(JALAPENO) || defined(SERRANO)
195 #define BPAR0 152 /* syndrom 152 through 167 for bus parity */
196 #define BPAR15 167
197 #endif /* JALAPENO || SERRANO */
198
/* Indexed by 9-bit syndrome; entry semantics are described above. */
static uint8_t ecc_syndrome_tab[] =
{
	NA, C0, C1, S003, C2, M2, M3, 47, C3, M2, M2, 53, M2, 41, 29, M,
	C4, M, M, 50, M2, 38, 25, M2, M2, 33, 24, M2, 11, M, M2, 16,
	C5, M, M, 46, M2, 37, 19, M2, M, 31, 32, M, 7, M2, M2, 10,
	M2, 40, 13, M2, 59, M, M2, 66, M, M2, M2, 0, M2, 67, 71, M,
	C6, M, M, 43, M, 36, 18, M, M2, 49, 15, M, 63, M2, M2, 6,
	M2, 44, 28, M2, M, M2, M2, 52, 68, M2, M2, 62, M2, M3, M3, M4,
	M2, 26, 106, M2, 64, M, M2, 2, 120, M, M2, M3, M, M3, M3, M4,
#if defined(JALAPENO) || defined(SERRANO)
	116, M2, M2, M3, M2, M3, M, M4, M2, 58, 54, M2, M, M4, M4, M3,
#else	/* JALAPENO || SERRANO */
	116, S071, M2, M3, M2, M3, M, M4, M2, 58, 54, M2, M, M4, M4, M3,
#endif	/* JALAPENO || SERRANO */
	C7, M2, M, 42, M, 35, 17, M2, M, 45, 14, M2, 21, M2, M2, 5,
	M, 27, M, M, 99, M, M, 3, 114, M2, M2, 20, M2, M3, M3, M,
	M2, 23, 113, M2, 112, M2, M, 51, 95, M, M2, M3, M2, M3, M3, M2,
	103, M, M2, M3, M2, M3, M3, M4, M2, 48, M, M, 73, M2, M, M3,
	M2, 22, 110, M2, 109, M2, M, 9, 108, M2, M, M3, M2, M3, M3, M,
	102, M2, M, M, M2, M3, M3, M, M2, M3, M3, M2, M, M4, M, M3,
	98, M, M2, M3, M2, M, M3, M4, M2, M3, M3, M4, M3, M, M, M,
	M2, M3, M3, M, M3, M, M, M, 56, M4, M, M3, M4, M, M, M,
	C8, M, M2, 39, M, 34, 105, M2, M, 30, 104, M, 101, M, M, 4,
#if defined(JALAPENO) || defined(SERRANO)
	M, M, 100, M, 83, M, M2, 12, 87, M, M, 57, M2, M, M3, M,
#else	/* JALAPENO || SERRANO */
	M, M, 100, M, 83, M, M2, 12, 87, M, M, 57, S11C, M, M3, M,
#endif	/* JALAPENO || SERRANO */
	M2, 97, 82, M2, 78, M2, M2, 1, 96, M, M, M, M, M, M3, M2,
	94, M, M2, M3, M2, M, M3, M, M2, M, 79, M, 69, M, M4, M,
	M2, 93, 92, M, 91, M, M2, 8, 90, M2, M2, M, M, M, M, M4,
	89, M, M, M3, M2, M3, M3, M, M, M, M3, M2, M3, M2, M, M3,
	86, M, M2, M3, M2, M, M3, M, M2, M, M3, M, M3, M, M, M3,
	M, M, M3, M2, M3, M2, M4, M, 60, M, M2, M3, M4, M, M, M2,
	M2, 88, 85, M2, 84, M, M2, 55, 81, M2, M2, M3, M2, M3, M3, M4,
	77, M, M, M, M2, M3, M, M, M2, M3, M3, M4, M3, M2, M, M,
	74, M, M2, M3, M, M, M3, M, M, M, M3, M, M3, M, M4, M3,
	M2, 70, 107, M4, 65, M2, M2, M, 127, M, M, M, M2, M3, M3, M,
	80, M2, M2, 72, M, 119, 118, M, M2, 126, 76, M, 125, M, M4, M3,
	M2, 115, 124, M, 75, M, M, M3, 61, M, M4, M, M4, M, M, M,
	M, 123, 122, M4, 121, M4, M, M3, 117, M2, M2, M3, M4, M3, M, M,
	111, M, M, M, M4, M3, M3, M, M, M, M3, M, M3, M2, M, M
};
242
243 #define ESYND_TBL_SIZE (sizeof (ecc_syndrome_tab) / sizeof (uint8_t))
244
/*
 * Maps an L2 tag data-bit number (0-127) to the cache way (0-3) whose
 * tag the bit belongs to.  NOTE(review): -1 apparently marks bits that
 * do not identify a single way (e.g. ECC check bits) — confirm against
 * callers of bit_to_way().
 */
int8_t L2TAG_bit_to_way_map[128] = {
/*	1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16 */
/* 1 */	0,  0,  0,  1,  1,  1,  2,  2,  2,  3,  3,  3,  0,  0,  0,  0,
/* 2 */	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/* 3 */	0,  0,  0,  0,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
/* 4 */	2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2, -1, -1, -1, -1,
/* 5 */	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  1,  1,  1,  1,
/* 6 */	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
/* 7 */	1,  1,  1,  1,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* 8 */	3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3, -1, -1, -1, -1,
};
256
/*
 * Maps an L2 tag data-bit number (0-127) to the bit position within the
 * owning way's tag word.  NOTE(review): C0 entries appear to mark bits
 * with no single tag-bit mapping (check bits) — confirm with the
 * consumer, tag_bit_to_way_bit().
 */
uint8_t L2TAG_bit_to_way_bit[128] = {
/*	 1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16 */
/* 1 */	 0,  1,  2,  0,  1,  2,  0,  1,  2,  0,  1,  2, 19, 20, 21, 22,
/* 2 */	23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38,
/* 3 */	39, 40, 41, 42, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30,
/* 4 */	31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, C0, C0, C0, C0,
/* 5 */	C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, 19, 20, 21, 22,
/* 6 */	23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38,
/* 7 */	39, 40, 41, 42, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30,
/* 8 */	31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, C0, C0, C0, C0,
};
268
/*
 * L3 counterpart of L2TAG_bit_to_way_map: data-bit number (0-127) to
 * owning way.  The L3 layout interleaves way pairs (1,3) and (0,2) one
 * bit at a time (see gen_data_for_ecc()); -1 = no single-way mapping.
 */
int8_t L3TAG_bit_to_way_map[128] = {
/*	1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16 */
/* 1 */	1,  3,  1,  3,  1,  3,  1,  3,  1,  3,  1,  3,  1,  3,  1,  3,
/* 2 */	1,  3,  1,  3,  1,  3,  1,  3,  1,  3,  1,  3,  1,  3,  1,  3,
/* 3 */	1,  3,  1,  3,  1,  3,  1,  3,  1,  3,  1,  3,  1,  3, -1, -1,
/* 4 */	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
/* 5 */	0,  2,  0,  2,  0,  2,  0,  2,  0,  2,  0,  2,  0,  2,  0,  2,
/* 6 */	0,  2,  0,  2,  0,  2,  0,  2,  0,  2,  0,  2,  0,  2,  0,  2,
/* 7 */	0,  2,  0,  2,  0,  2,  0,  2,  0,  2,  0,  2,  0,  2, -1, -1,
/* 8 */	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
};
280
/*
 * L3 counterpart of L2TAG_bit_to_way_bit: data-bit number (0-127) to
 * bit position within the owning way's tag word; C0 entries have no
 * single tag-bit mapping (check bits).
 */
uint8_t L3TAG_bit_to_way_bit[128] = {
/*	 1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16 */
/* 1 */	 0,  0,  1,  1,  2,  2, 24, 24, 25, 25, 26, 26, 27, 27, 28, 28,
/* 2 */	29, 29, 30, 30, 31, 31, 32, 32, 33, 33, 34, 34, 35, 35, 36, 36,
/* 3 */	37, 37, 38, 38, 39, 39, 40, 40, 41, 41, 42, 42, 43, 43, C0, C0,
/* 4 */	C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0,
/* 5 */	 0,  0,  1,  1,  2,  2, 24, 24, 25, 25, 26, 26, 27, 27, 28, 28,
/* 6 */	29, 29, 30, 30, 31, 31, 32, 32, 33, 33, 34, 34, 35, 35, 36, 36,
/* 7 */	37, 37, 38, 38, 39, 39, 40, 40, 41, 41, 42, 42, 43, 43, C0, C0,
/* 8 */	C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0,
};
292
293 uint16_t
calcecc(uint64_t chi,uint64_t clo)294 calcecc(uint64_t chi, uint64_t clo)
295 {
296 int i;
297 uint64_t syndrome = 0;
298
299 for (i = 0; i < (NDATABITS/2); i++) {
300 syndrome ^= ((chi & 1) ? e[(NDATABITS/2) + i] : 0) ^
301 ((clo & 1) ? e[i] : 0);
302 chi >>= 1;
303 clo >>= 1;
304 }
305 return (uint16_t)(syndrome);
306 }
307
/*
 * XOR the recomputed ECC for (chi, clo) with the ECC that was actually
 * stored; a nonzero result is the error syndrome.
 */
uint64_t
calcsynd(uint64_t chi, uint64_t clo, uint64_t ecc)
{
	uint64_t computed = calcecc(chi, clo);

	return (computed ^ ecc);
}
313
314 static uint8_t
tag_bit_to_way_bit(cmd_ptrsubtype_t pstype,int16_t tag_bit)315 tag_bit_to_way_bit(cmd_ptrsubtype_t pstype, int16_t tag_bit)
316 {
317 uint8_t way_bit = C0;
318
319 switch (pstype) {
320 case CMD_PTR_CPU_L2TAG:
321 way_bit = L2TAG_bit_to_way_bit[tag_bit];
322 break;
323 case CMD_PTR_CPU_L3TAG:
324 way_bit = L3TAG_bit_to_way_bit[tag_bit];
325 break;
326 }
327 return (way_bit);
328 }
329
330 static int8_t
bit_to_way(cmd_ptrsubtype_t pstype,uint32_t bit)331 bit_to_way(cmd_ptrsubtype_t pstype, uint32_t bit)
332 {
333 int8_t way = -1;
334
335 switch (pstype) {
336 case CMD_PTR_CPU_L2TAG:
337 way = L2TAG_bit_to_way_map[bit & BIT_MASK];
338 break;
339 case CMD_PTR_CPU_L3TAG:
340 way = L3TAG_bit_to_way_map[bit & BIT_MASK];
341 break;
342 }
343 return (way);
344 }
345
346 static int32_t
get_index(cmd_ptrsubtype_t pstype,uint64_t tag_afar)347 get_index(cmd_ptrsubtype_t pstype, uint64_t tag_afar)
348 {
349 int32_t index = -1;
350
351 switch (pstype) {
352 case CMD_PTR_CPU_L2TAG:
353 index = (int32_t)((tag_afar & PN_L2_INDEX_MASK)
354 >> PN_CACHE_LINE_SHIFT);
355 break;
356 case CMD_PTR_CPU_L3TAG:
357 index = (int32_t)((tag_afar & PN_L3_TAG_RD_MASK)
358 >> PN_CACHE_LINE_SHIFT);
359 break;
360 }
361 return (index);
362 }
363
364 static int
get_retired_ways(uint64_t * tag_data)365 get_retired_ways(uint64_t *tag_data)
366 {
367 int i, retired_ways;
368
369 retired_ways = 0;
370 for (i = 0; i < PN_CACHE_NWAYS; i++) {
371 if ((tag_data[i] & CH_ECSTATE_MASK) ==
372 PN_ECSTATE_NA)
373 retired_ways++;
374 }
375 return (retired_ways);
376 }
377
/*
 * Extract the tag AFAR and the four per-way tag words for the affected
 * L2/L3 index from the ereport payload in nvl.
 *
 * On success, *afarp and tag_data[0..PN_CACHE_NWAYS-1] are filled in and
 * CMD_EVD_OK is returned.  CMD_EVD_BAD is returned when the AFAR is
 * invalid or pstype is not an L2/L3 tag type.  When the ereport is a
 * tag re-check (or test_mode is set), or the payload lacks the raw
 * cache-line array, the tags are instead read live through the
 * mem_cache driver via get_tagdata().
 */
static cmd_evdisp_t
extract_data_from_ereport_payload(fmd_hdl_t *hdl, nvlist_t *nvl,
    cmd_cpu_t *cpu,
    cmd_ptrsubtype_t pstype,
    uint64_t *afarp, uint64_t *tag_data,
    const char *fltnm)
{
	ch_ec_data_t *ec_data;
	char *payload_namep;
	int tag_afar_status;
	uint64_t tag_afar;
	int i;
	uint_t sz;
	int32_t index;
	int32_t recheck_of_tags;

	/* Validate and fetch the AFAR recorded in the ereport. */
	tag_afar_status = cmd_afar_valid(hdl, nvl, 0, &tag_afar);
	if (tag_afar_status == -1) {
		fmd_hdl_debug(hdl,
		    "\n%s:cpu_id = %d Invalid afar status in nvlist\n",
		    fltnm, cpu->cpu_cpuid);
		return (CMD_EVD_BAD);
	}
	*afarp = tag_afar;
	index = get_index(pstype, tag_afar);
	/* Pick the payload member that carries the raw line data. */
	switch (pstype) {
	case CMD_PTR_CPU_L2TAG:
		payload_namep = FM_EREPORT_PAYLOAD_NAME_L2_DATA;
		break;
	case CMD_PTR_CPU_L3TAG:
		payload_namep = FM_EREPORT_PAYLOAD_NAME_L3_DATA;
		break;
	default:
		return (CMD_EVD_BAD);
	}
	/* Absent member simply means this is not a re-check ereport. */
	if (nvlist_lookup_int32(nvl, FM_EREPORT_RECHECK_OF_TAGS,
	    &recheck_of_tags) != 0)
		recheck_of_tags = 0;
	/* Re-checks (and test mode) always read the tags live. */
	if ((recheck_of_tags) || (test_mode))
		return (get_tagdata(cpu, pstype, index, tag_data));
	if (nvlist_lookup_uint64_array(nvl, payload_namep,
	    (uint64_t **)&ec_data, &sz) != 0) {
		fmd_hdl_debug(hdl,
		    "\n%s: cpu_id = %d index = %d could not find %s"
		    " in nvlist\n",
		    fltnm, cpu->cpu_cpuid, index, payload_namep);
		fmd_hdl_debug(hdl,
		    "\n%s: cpu_id = %d Reading tag data through"
		    " mem_cache driver.\n",
		    fltnm, cpu->cpu_cpuid);
		return (get_tagdata(cpu, pstype, index,
		    tag_data));
	}
	/* Payload present: copy out one tag word per way. */
	for (i = 0; i < PN_CACHE_NWAYS; i++) {
		tag_data[i] = ec_data[i].ec_tag;
	}
	return (CMD_EVD_OK);
}
436
437 static void
print_ecc(fmd_hdl_t * hdl,cmd_cpu_t * cpu,const char * fltnm,uint64_t * tag_data)438 print_ecc(fmd_hdl_t *hdl, cmd_cpu_t *cpu, const char *fltnm, uint64_t *tag_data)
439 {
440 int i;
441 uint16_t tag_ecc[PN_CACHE_NWAYS];
442
443 for (i = 0; i < PN_CACHE_NWAYS; i++) {
444 tag_ecc[i] =
445 ((tag_data[i] & PN_TAG_ECC_MASK)
446 >> PN_LX_TAG_ECC_START_BIT);
447 }
448 fmd_hdl_debug(hdl,
449 "\n%s: cpu_id = %d ecc[0] = 0x%03x ecc[1] = 0x%03x"
450 " ecc[2] = 0x%03x ecc[3] = 0x%03x\n",
451 fltnm, cpu->cpu_cpuid, tag_ecc[0], tag_ecc[1], tag_ecc[2],
452 tag_ecc[3]);
453
454 }
455
456 static int
matching_ecc(uint64_t * tag_data)457 matching_ecc(uint64_t *tag_data)
458 {
459 int i;
460 uint16_t tag_ecc[PN_CACHE_NWAYS];
461
462 for (i = 0; i < PN_CACHE_NWAYS; i++) {
463 tag_ecc[i] =
464 ((tag_data[i] & PN_TAG_ECC_MASK)
465 >> PN_LX_TAG_ECC_START_BIT);
466 if (tag_ecc[i] != tag_ecc[0]) {
467 return (1);
468 }
469 }
470 return (0);
471 }
472
/*
 * Reassemble the 128-bit quantity over which the hardware computes the
 * tag ECC for one L2/L3 index: the state and physical-tag fields of all
 * four ways, packed in the hardware's bit order.  The result is
 * returned in data_for_ecc_gen[0] (high 64 bits) and
 * data_for_ecc_gen[1] (low 64 bits), ready for calcecc()/calcsynd().
 * L2 packs the fields contiguously; L3 interleaves way pairs (1,3) and
 * (0,2) one bit at a time.
 */
static void
gen_data_for_ecc(uint64_t *tag_data, uint64_t *data_for_ecc_gen,
    cmd_ptrsubtype_t pstype)
{
	uint64_t ptag[PN_CACHE_NWAYS];
	uint8_t state[PN_CACHE_NWAYS];
	int i;
	uint16_t tag_ecc[PN_CACHE_NWAYS];
	uint8_t bit_position;

	/* Split each way's raw tag word into state, ECC and physical tag. */
	for (i = 0; i < PN_CACHE_NWAYS; i++) {
		state[i] = tag_data[i] & CH_ECSTATE_MASK;
		tag_ecc[i] =
		    ((tag_data[i] & PN_TAG_ECC_MASK)
		    >> PN_LX_TAG_ECC_START_BIT);
		switch (pstype) {
		case CMD_PTR_CPU_L2TAG:
			ptag[i] = (tag_data[i] >> PN_L2_PTAG_SHIFT) &
			    L2_PTAG_MASK;
			break;
		case CMD_PTR_CPU_L3TAG:
			ptag[i] = (tag_data[i] >> PN_L3_PTAG_SHIFT) &
			    L3_PTAG_MASK;
			break;
		}
	}
	/*
	 * We now assemble the 128 bit data swizzling the Physical tags
	 * and states we obtained for all the 4 ways.
	 */
	data_for_ecc_gen[0] = 0; /* high order 64 bits */
	data_for_ecc_gen[1] = 0; /* low order 64 bits */
	switch (pstype) {
	case CMD_PTR_CPU_L2TAG:
		/* L2: 3-bit states back to back, then the two ptags. */
		data_for_ecc_gen[1] = state[0]; /* way 0 state */
		data_for_ecc_gen[1] |=
		    (state[1] << 3); /* way 1 state */
		data_for_ecc_gen[1] |=
		    (state[2] << 6); /* way 2 state */
		data_for_ecc_gen[1] |=
		    (state[3] << 9); /* way 3 state */
		data_for_ecc_gen[1] |= (ptag[0] << 12); /* way 0 ptag */
		data_for_ecc_gen[1] |= (ptag[2] << 36); /* way 2 ptag */
		/* bits 63:60 of low order 64 bits are 0s */

		/*
		 * We now start with high order 64 bits.
		 * the low 12 bits are 0s
		 */
		data_for_ecc_gen[0] |= (ptag[1] << 12); /* way 1 ptag */
		data_for_ecc_gen[0] |= (ptag[3] << 36); /* way 3 ptag */
		break;
	case CMD_PTR_CPU_L3TAG:
		bit_position = 0;
		/*
		 * Swizzle state bits for way 1 and way 3
		 * (alternating one bit from each way).
		 */
		for (i = 0; i < 3; i++) {
			data_for_ecc_gen[1] |=
			    (((state[1] >> i) & 1) << bit_position);
			bit_position++;
			data_for_ecc_gen[1] |=
			    (((state[3] >> i) & 1) << bit_position);
			bit_position++;
		}
		/*
		 * Swizzle physical tag bits for way 1 and way 3
		 */
		for (i = 0; i < 20; i++) {
			data_for_ecc_gen[1] |=
			    (((ptag[1] >> i) & 1) << bit_position);
			bit_position++;
			data_for_ecc_gen[1] |=
			    (((ptag[3] >> i) & 1) << bit_position);
			bit_position++;
		}
		/*
		 * start the high order 64 bits.
		 */
		bit_position = 0;
		/*
		 * Swizzle state bits for way 0 and way 2
		 */
		for (i = 0; i < 3; i++) {
			data_for_ecc_gen[0] |=
			    (((state[0] >> i) & 1) << bit_position);
			bit_position++;
			data_for_ecc_gen[0] |=
			    (((state[2] >> i) & 1) << bit_position);
			bit_position++;
		}
		/*
		 * Swizzle physical tag bits for way 0 and way 2
		 */
		for (i = 0; i < 20; i++) {
			data_for_ecc_gen[0] |=
			    (((ptag[0] >> i) & 1) << bit_position);
			bit_position++;
			data_for_ecc_gen[0] |=
			    (((ptag[2] >> i) & 1) << bit_position);
			bit_position++;
		}
		break;
	}
}
578
579 static uint16_t
compute_syndrome(uint64_t * tag_data,cmd_ptrsubtype_t pstype)580 compute_syndrome(uint64_t *tag_data, cmd_ptrsubtype_t pstype)
581 {
582 uint64_t tag_synd;
583 uint64_t data_for_ecc_gen[2];
584 uint16_t tag_ecc;
585
586 gen_data_for_ecc(tag_data, data_for_ecc_gen, pstype);
587 tag_ecc = ((tag_data[0] & PN_TAG_ECC_MASK) >> PN_LX_TAG_ECC_START_BIT);
588 tag_synd = calcsynd(data_for_ecc_gen[0], data_for_ecc_gen[1],
589 (uint64_t)tag_ecc);
590 return (tag_synd);
591 }
592
593 static int16_t
find_bit_stickiness(uint64_t * tag_data,int8_t way,int16_t bit)594 find_bit_stickiness(uint64_t *tag_data, int8_t way, int16_t bit)
595 {
596 int16_t sticky_bit;
597
598 sticky_bit = bit;
599 if ((tag_data[way] & ((uint64_t)1 << bit)) != 0)
600 sticky_bit |= MSB_BIT;
601 return (sticky_bit);
602 }
603
/*
 * Replace an existing Lxcache with a fresh one describing the same
 * (type, index, way, bit): create the new Lxcache and open a case for
 * it, then destroy the old structure (whose SERD engine holds the
 * triggering event ep).  Returns the new Lxcache, or NULL if creation
 * failed (the old Lxcache is left intact in that case).
 */
static cmd_Lxcache_t *
cmd_create_and_destroy_Lxcache(fmd_hdl_t *hdl, cmd_cpu_t *cpu,
    cmd_Lxcache_t *Lxcache)
{
	const char *fltnm;
	cmd_Lxcache_t *new_Lxcache;

	fltnm = cmd_type_to_str(Lxcache->Lxcache_type);

	/*
	 * We first create a new Lxcache and add the event ep
	 * that is in Lxcache to the new case we create.
	 * we then destroy the Lxcache that has the event ep in its SERD engine.
	 */
	new_Lxcache = cmd_Lxcache_create(hdl, Lxcache->xr, cpu,
	    cpu->cpu_asru_nvl,
	    Lxcache->Lxcache_type,
	    Lxcache->Lxcache_index, Lxcache->Lxcache_way, Lxcache->Lxcache_bit);
	if (new_Lxcache == NULL) {
		fmd_hdl_debug(hdl,
		    "\n%s:cpu_id %d:Failed to create a Lxcache for"
		    " index %d way %d bit %d\n",
		    fltnm, cpu->cpu_cpuid, Lxcache->Lxcache_index,
		    Lxcache->Lxcache_way, Lxcache->Lxcache_bit);
		return (NULL);
	}
	(void) cmd_create_case_for_Lxcache(hdl, cpu, new_Lxcache);
	cmd_Lxcache_destroy(hdl, cpu, Lxcache);
	return (new_Lxcache);
}
634
635 int
cmd_Lxcache_retire_as_reason(fmd_hdl_t * hdl,cmd_cpu_t * cpu,cmd_Lxcache_t * Lxcache,const char * fltnm,int32_t reason)636 cmd_Lxcache_retire_as_reason(fmd_hdl_t *hdl, cmd_cpu_t *cpu,
637 cmd_Lxcache_t *Lxcache, const char *fltnm, int32_t reason)
638 {
639 boolean_t ret;
640 uint_t certainty;
641
642 if (reason == CMD_LXSUSPECT_0_TAG) {
643 /*
644 * clear MSB bit to retire as SUSPECT_0_TAG
645 * We need to update the Lxcache asru to reflect
646 * the change in bit value.
647 */
648 Lxcache->Lxcache_bit &= CLEAR_MSB_BIT;
649 errno = nvlist_add_uint16(
650 Lxcache->Lxcache_asru_nvl,
651 FM_FMRI_CPU_CACHE_BIT,
652 Lxcache->Lxcache_bit);
653 if (errno) {
654 fmd_hdl_debug(hdl,
655 "\n%s:cpu_id %d: failed to update",
656 " CACHE_BIT in asru.\n",
657 fltnm, cpu->cpu_cpuid);
658 return (CMD_EVD_BAD);
659 }
660 }
661 if (reason == CMD_LXCONVICTED)
662 certainty = HUNDRED_PERCENT;
663 else
664 certainty = SUSPECT_PERCENT;
665 ret = cmd_Lxcache_retire(hdl, cpu, Lxcache, fltnm, certainty);
666 if (reason == CMD_LXSUSPECT_0_TAG)
667 Lxcache->Lxcache_bit |= SET_MSB_BIT;
668 if (ret == B_FALSE)
669 return (CMD_EVD_BAD);
670 Lxcache->Lxcache_reason = reason;
671 /*
672 * Update the persistence storage of
673 * Lxcache.
674 */
675 fmd_hdl_debug(hdl,
676 "\n%s:cpu_id %d:reason = %s flags = %s\n",
677 fltnm, cpu->cpu_cpuid,
678 cmd_reason_to_str(Lxcache->Lxcache_reason),
679 cmd_flags_to_str(Lxcache->Lxcache_flags));
680 cmd_Lxcache_write(hdl, Lxcache);
681 return (CMD_EVD_OK);
682 }
683
/*
 * Handle an anonymous (way unknown) TAG or DATA error by retiring the
 * lowest retirable way at the affected index as a SUSPECT.  On exit the
 * anonymous_Lxcache passed in has always been destroyed; on success a
 * new Lxcache bound to the chosen way has been created and retired with
 * reason CMD_LXSUSPECT_0_TAG (tag errors) or CMD_LXSUSPECT_DATA (data
 * errors).  If no way can be retired -- or, for data errors, three ways
 * are already retired at this index -- the whole CPU is faulted.
 * Returns CMD_EVD_OK or CMD_EVD_BAD.
 */
int
retire_lowest_retirable_way_as_suspect(fmd_hdl_t *hdl, cmd_cpu_t *cpu,
    cmd_Lxcache_t *anonymous_Lxcache, const char *fltnm)
{
	/*
	 * This routine is called only when handling anonymous TAG or DATA
	 * errors. When we exit this routine we would have destroyed the
	 * anonymous_Lxcache structure that was passed to us and created
	 * a new Lxcache if we were successful in determining a way to retire.
	 */
	int8_t lowest_retirable_way, ways_retired;
	int32_t reason;
	cmd_ptrsubtype_t type;
	cmd_Lxcache_t *new_Lxcache;

	ways_retired = get_index_retired_ways(cpu,
	    anonymous_Lxcache->Lxcache_type,
	    anonymous_Lxcache->Lxcache_index);
	if (ways_retired == -1) {
		/*
		 * Couldn't determine how many ways have been retired at this
		 * index. Destroy the anonymous_Lxcache and return failure.
		 */
		cmd_Lxcache_destroy(hdl, cpu, anonymous_Lxcache);
		return (CMD_EVD_BAD);
	}
	/*
	 * Before retiring a way check if we have already
	 * retired 3 ways for this index.
	 * For TAG errors we will not perform this check because
	 * we could reretire cachlines retired for DATA errors.
	 * The get_lowest_retirable_way() will ensure that we do
	 * not end up retiring all 4 ways.
	 */
	if (!IS_TAG(anonymous_Lxcache->Lxcache_type)) {
		if (ways_retired >= 3) {
			fmd_hdl_debug(hdl,
			    "\n%s: cpu %d: num of ways retired for index %d"
			    " is %d will fault the CPU\n",
			    fltnm, cpu->cpu_cpuid,
			    anonymous_Lxcache->Lxcache_index, ways_retired);
			type = anonymous_Lxcache->Lxcache_type;
			/*
			 * destroy the anonymous_Lxcache
			 */
			cmd_Lxcache_destroy(hdl, cpu, anonymous_Lxcache);
			cmd_fault_the_cpu(hdl, cpu, type, fltnm);
			return (CMD_EVD_OK);
		}
	}
	/*
	 * No ways have been retired as "SUSPECT" for this bit.
	 * We need to retire the lowest unretired way as suspect.
	 */
	fmd_hdl_debug(hdl,
	    "\n%s: cpu_id %d Checking for the lowest retirable"
	    " way at index %d\n",
	    fltnm, cpu->cpu_cpuid, anonymous_Lxcache->Lxcache_index);
	lowest_retirable_way = cmd_Lxcache_get_lowest_retirable_way(cpu,
	    anonymous_Lxcache->Lxcache_index, anonymous_Lxcache->Lxcache_type);
	if (lowest_retirable_way != -1) {
		fmd_hdl_debug(hdl,
		    "\n%s: cpu_id %d lowest retirable way is %d\n",
		    fltnm, cpu->cpu_cpuid, lowest_retirable_way);
		/* Bind the anonymous record to the chosen way. */
		anonymous_Lxcache->Lxcache_way = lowest_retirable_way;
		new_Lxcache = cmd_create_and_destroy_Lxcache(hdl, cpu,
		    anonymous_Lxcache);
		if ((new_Lxcache == NULL) ||
		    (new_Lxcache->Lxcache_case.cc_cp == NULL)) {
			return (CMD_EVD_BAD);
		}
		if (IS_TAG(new_Lxcache->Lxcache_type))
			reason = CMD_LXSUSPECT_0_TAG;
		else
			reason = CMD_LXSUSPECT_DATA;
		return (cmd_Lxcache_retire_as_reason(hdl, cpu, new_Lxcache,
		    fltnm, reason));
	} else {
		fmd_hdl_debug(hdl,
		    "\n%s:cpu_id %d we are unable to determine which"
		    " way is faulty at cache index %d."
		    " Will retire the CPU.\nRecommended-Action:"
		    " Service action required\n",
		    fltnm, cpu->cpu_cpuid, anonymous_Lxcache->Lxcache_index);
		type = anonymous_Lxcache->Lxcache_type;
		/*
		 * destroy the anonymous_Lxcache
		 */
		cmd_Lxcache_destroy(hdl, cpu, anonymous_Lxcache);
		cmd_fault_the_cpu(hdl, cpu, type, fltnm);
		return (CMD_EVD_OK);
	}
}
777
/*
 * A way at this index was previously retired as a SUSPECT for the
 * observed anonymous TAG/DATA error, yet errors persist: unretire that
 * suspect way and retire the next retirable way instead.  On success the
 * suspect_Lxcache is destroyed; the anonymous_Lxcache is always consumed
 * (either rebound to the new way or destroyed).  If there is no further
 * retirable way -- or unretiring fails -- the CPU is faulted or the call
 * fails respectively.  Returns CMD_EVD_OK or CMD_EVD_BAD.
 */
int
unretire_suspect_and_retire_next_retirable_way(fmd_hdl_t *hdl, cmd_cpu_t *cpu,
    cmd_Lxcache_t *suspect_Lxcache, cmd_Lxcache_t *anonymous_Lxcache,
    const char *fltnm)
{
	int8_t retired_way, next_retirable_way;
	int32_t retired_index;
	cmd_ptrsubtype_t retired_type;
	int32_t reason;
	cmd_Lxcache_t *new_Lxcache;

	/*
	 * This routine is called only when handling anonymous TAG or DATA
	 * errors. When we exit this routine we would have destroyed the
	 * anonymous_Lxcache structure that was passed to us.
	 */
	fmd_hdl_debug(hdl,
	    "\n%s:cpu_id %d found index %d way %d"
	    " bit %d retired as %s. Will unretire this now.\n",
	    fltnm, cpu->cpu_cpuid, suspect_Lxcache->Lxcache_index,
	    suspect_Lxcache->Lxcache_way, suspect_Lxcache->Lxcache_bit,
	    cmd_reason_to_str(suspect_Lxcache->Lxcache_reason));
	/*
	 * Save the way because we will destroy the
	 * suspect_Lxcache after we successfully unretire it.
	 */
	retired_way = suspect_Lxcache->Lxcache_way;
	retired_index = suspect_Lxcache->Lxcache_index;
	retired_type = suspect_Lxcache->Lxcache_type;
	/*
	 * unretire the retired_way.
	 */
	if (cmd_Lxcache_unretire(hdl, cpu, suspect_Lxcache,
	    fltnm)
	    == B_TRUE) {
		suspect_Lxcache->Lxcache_reason =
		    CMD_LXFUNCTIONING;
		fmd_hdl_debug(hdl,
		    "\n%s:cpu_id %d index %d way %d"
		    " successfully unretired. Will"
		    " destroy this Lxcache now.\n",
		    fltnm, cpu->cpu_cpuid, suspect_Lxcache->Lxcache_index,
		    suspect_Lxcache->Lxcache_way);
		cmd_Lxcache_destroy(hdl, cpu, suspect_Lxcache);
	} else {
		/*
		 * destroy the anonymous_Lxcache
		 */
		cmd_Lxcache_destroy(hdl, cpu, anonymous_Lxcache);
		return (CMD_EVD_BAD);
	}
	/*
	 * retire the next retirable way
	 */
	next_retirable_way = cmd_Lxcache_get_next_retirable_way(cpu,
	    retired_index,
	    retired_type, retired_way);
	if (next_retirable_way == -1) {
		/*
		 * There is no retirable way that is next to the
		 * one we just retired. We need to offline the
		 * CPU since we are unable to determine which
		 * way is reporting the errors.
		 */
		fmd_hdl_debug(hdl,
		    "\n%s:cpu_id %d we are unable to determine"
		    " which way is faulty at cache index %d."
		    " It is likely that we have a leaky bit"
		    " that gets corrected.\n Will retire"
		    " the CPU.\nRecommended-Action: Service"
		    " action required\n",
		    fltnm, cpu->cpu_cpuid, retired_index);
		/*
		 * destroy the anonymous_Lxcache
		 */
		cmd_Lxcache_destroy(hdl, cpu, anonymous_Lxcache);
		cmd_fault_the_cpu(hdl, cpu, retired_type, fltnm);
		return (CMD_EVD_OK);
	} else {
		fmd_hdl_debug(hdl,
		    "\n%s:cpu_id %d found way %d at index %d to"
		    " retire as SUSPECT_0/SUSPECT_DATA\n",
		    fltnm, cpu->cpu_cpuid, next_retirable_way, retired_index);
		/*
		 * We need to create a new Lxcache struture.
		 * The existing Lxcache is for anonymous way.
		 */
		anonymous_Lxcache->Lxcache_way = next_retirable_way;
		new_Lxcache = cmd_create_and_destroy_Lxcache(hdl,
		    cpu, anonymous_Lxcache);
		if ((new_Lxcache == NULL) ||
		    (new_Lxcache->Lxcache_case.cc_cp == NULL)) {
			return (CMD_EVD_BAD);
		}
		if (IS_TAG(new_Lxcache->Lxcache_type))
			reason = CMD_LXSUSPECT_0_TAG;
		else
			reason = CMD_LXSUSPECT_DATA;
		return (cmd_Lxcache_retire_as_reason(hdl, cpu, new_Lxcache,
		    fltnm, reason));
	}
}
880
881 void
find_and_destroy_anonymous_Lxcache(fmd_hdl_t * hdl,cmd_cpu_t * cpu,cmd_ptrsubtype_t pstype,int32_t index)882 find_and_destroy_anonymous_Lxcache(fmd_hdl_t *hdl, cmd_cpu_t *cpu,
883 cmd_ptrsubtype_t pstype, int32_t index)
884 {
885 cmd_Lxcache_t *anonymous_Lxcache;
886 const char *fltnm;
887
888 fltnm = cmd_type_to_str(pstype);
889 anonymous_Lxcache =
890 cmd_Lxcache_lookup_by_type_index_way_bit(cpu,
891 pstype, index, -1, -1);
892 if (anonymous_Lxcache != NULL) {
893 fmd_hdl_debug(hdl,
894 "\n%s:cpu_id = %d index = %d We are destroying the"
895 " anonymous Lxcache now.\n",
896 fltnm, cpu->cpu_cpuid, index);
897 /*
898 * Free the resources allocated to handle
899 * recheck_of_tags. Delete the Lxcache.
900 */
901 cmd_Lxcache_destroy(hdl, cpu,
902 anonymous_Lxcache);
903 }
904 }
905
906 void
cmd_Lxcache_anonymous_tag_error_timeout(fmd_hdl_t * hdl,id_t id)907 cmd_Lxcache_anonymous_tag_error_timeout(fmd_hdl_t *hdl, id_t id)
908 {
909 cmd_Lxcache_t *Lxcache;
910 const char *class;
911
912
913 /*
914 * We search thru the entire Lxcache structures to find
915 * a matching id.
916 */
917 Lxcache = cmd_Lxcache_lookup_by_timeout_id(id);
918 if (Lxcache == NULL) {
919 fmd_hdl_debug(hdl,
920 "Could not find Lxcache for timeout_id 0x%x\n", id);
921 return;
922 }
923 fmd_hdl_debug(hdl,
924 "\n%s:anonymous_tag_error_timeout:index = %d\n",
925 cmd_type_to_str(Lxcache->Lxcache_type),
926 Lxcache->Lxcache_index);
927 /*
928 * Set timeout_id to -1 to indicate that we have processed the
929 * timeout.
930 */
931 Lxcache->Lxcache_timeout_id = -1;
932 switch (Lxcache->Lxcache_type) {
933 case CMD_PTR_CPU_L2TAG:
934 class = "ereport.cpu.ultraSPARC-IVplus.thce";
935 (void) cmd_txce(hdl, Lxcache->Lxcache_ep,
936 Lxcache->Lxcache_nvl,
937 class, Lxcache->Lxcache_clcode);
938 break;
939 case CMD_PTR_CPU_L3TAG:
940 class = "ereport.cpu.ultraSPARC-IVplus.l3-thce";
941 (void) cmd_l3_thce(hdl, Lxcache->Lxcache_ep,
942 Lxcache->Lxcache_nvl,
943 class, Lxcache->Lxcache_clcode);
944 break;
945 default:
946 fmd_hdl_debug(hdl,
947 "Unexpected pstype 0x%x found in"
948 " anonymous_tag_error_timeout: index = %d\n",
949 Lxcache->Lxcache_type,
950 Lxcache->Lxcache_index);
951 return;
952 }
953 }
954
955 cmd_evdisp_t
cmd_us4plus_tag_err(fmd_hdl_t * hdl,fmd_event_t * ep,nvlist_t * nvl,cmd_cpu_t * cpu,cmd_ptrsubtype_t pstype,const char * serdn,const char * serdt,const char * fltnm,cmd_errcl_t clcode)956 cmd_us4plus_tag_err(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl,
957 cmd_cpu_t *cpu, cmd_ptrsubtype_t pstype,
958 const char *serdn, const char *serdt,
959 const char *fltnm, cmd_errcl_t clcode)
960 {
961 uint64_t tag_afar;
962 int32_t index;
963 int8_t way;
964 int16_t tag_bit, bit, sticky_bit;
965 cmd_Lxcache_t *Lxcache, *suspect_Lxcache, *retired_Lxcache;
966 cmd_Lxcache_t *anonymous_Lxcache;
967 uint64_t tag_synd;
968 uint64_t tag_data[PN_CACHE_NWAYS];
969 uint8_t state;
970 int ways_retired, ret;
971 int retries_for_ecc_match;
972 int32_t recheck_of_tags;
973 int way_already_retired = 0;
974
975 /*
976 * We now extract physical tags and states
977 * and also look for matching ECC on all 4 ways.
978 */
979 ret = extract_data_from_ereport_payload(hdl, nvl, cpu, pstype,
980 &tag_afar, tag_data, fltnm);
981 if (ret != 0)
982 return (ret);
983 index = get_index(pstype, tag_afar);
984 retries_for_ecc_match = 0;
985 while (matching_ecc(tag_data) != 0) {
986 if (retries_for_ecc_match >= MAX_RETRIES_FOR_ECC_MATCH)
987 return (CMD_EVD_BAD);
988 print_ecc(hdl, cpu, fltnm, tag_data);
989 fmd_hdl_debug(hdl,
990 "\n%s:cpu_id = %d index = %d ECCs don't match.\n"
991 "Reading tag info again.\n",
992 fltnm, cpu->cpu_cpuid, index);
993 (void) get_tagdata(cpu, pstype, index, tag_data);
994 retries_for_ecc_match++;
995 }
996 ways_retired = get_retired_ways(tag_data);
997 fmd_hdl_debug(hdl,
998 "\n%s:cpu_id %d: found %d ways retired at the index %d\n",
999 fltnm, cpu->cpu_cpuid, ways_retired, index);
1000 tag_synd = compute_syndrome(tag_data, pstype);
1001 ret = nvlist_lookup_int32(nvl, FM_EREPORT_RECHECK_OF_TAGS,
1002 &recheck_of_tags);
1003 if (ret != CMD_EVD_OK) {
1004 fmd_hdl_debug(hdl,
1005 "ret value = %d for nvlist_lookup of recheck_of_tags\n",
1006 ret);
1007 recheck_of_tags = 0;
1008 }
1009 if (tag_synd == 0) {
1010 /*
1011 * The bit has been corrected by writeback, we will
1012 * first check if we are processing the re-check of tags
1013 * that we scheduled thru the timeout call.
1014 * if so we will exit if we reached the max retries.
1015 * Else we start a timeout and exit.
1016 * We will create a Lxcache structure for this index with way
1017 * as -1 and bit as -1. We will also keep a count of
1018 * attempts we made to check the tag data at this index.
1019 *
1020 */
1021 way = -1;
1022 bit = -1;
1023 Lxcache = cmd_Lxcache_lookup_by_type_index_way_bit(cpu, pstype,
1024 index, way, bit);
1025 if (recheck_of_tags) {
1026 /*
1027 * We are processing the re-read of tags scheduled by
1028 * timeout. Exit if retry limit has been
1029 * reached. Else start another timeout.
1030 */
1031 if (Lxcache == NULL) {
1032 /*
1033 * This shouldn't happen.
1034 */
1035 fmd_hdl_debug(hdl,
1036 "\n%s: cpu_id = %d failed to lookup"
1037 " index = %d way %d bit %d\n",
1038 fltnm, cpu->cpu_cpuid, index, way, bit);
1039 return (CMD_EVD_BAD);
1040 }
1041 fmd_hdl_debug(hdl,
1042 "\n%s: cpu_id = %d index = %d syndrome"
1043 " computed is 0 in attempt #%d.\n",
1044 fltnm, cpu->cpu_cpuid, index,
1045 Lxcache->Lxcache_retry_count);
1046 if (Lxcache->Lxcache_retry_count >=
1047 RETRIES_TO_BE_DONE_WHEN_SYND_IS_ZERO) {
1048 /*
1049 * We free only the nvl list here.
1050 * anonymous SERD engine will be freed
1051 * when the Lxcache gets destroyed.
1052 * We need the anonymous SERD engine still
1053 * because it has the event ep.
1054 * reset or destroy of SERD engine frees the
1055 * event ep.
1056 */
1057 if (Lxcache->Lxcache_nvl != NULL) {
1058 nvlist_free(Lxcache->Lxcache_nvl);
1059 Lxcache->Lxcache_nvl = NULL;
1060 }
1061 fmd_hdl_debug(hdl,
1062 "\n%s:cpu_id %d Max retry count reached. Giving up.\n",
1063 fltnm, cpu->cpu_cpuid);
1064 Lxcache->Lxcache_timeout_id = -1;
1065 Lxcache->Lxcache_retry_count = 0;
1066 goto process_after_finding_way_bit;
1067 } else {
1068 Lxcache->Lxcache_retry_count++;
1069 Lxcache->Lxcache_timeout_id =
1070 fmd_timer_install(hdl,
1071 (void *)CMD_TIMERTYPE_ANONYMOUS_TAG_ERROR,
1072 NULL,
1073 (cmd_Lxcache_recheck_tags_delay[
1074 Lxcache->Lxcache_retry_count] * NANOSEC));
1075 return (CMD_EVD_OK);
1076 }
1077 }
1078 /*
1079 * Check if we already have a Lxcache structure
1080 * with anonymous way and bit created.
1081 */
1082 if (Lxcache == NULL) {
1083 Lxcache = cmd_Lxcache_create(hdl, 0, cpu,
1084 cpu->cpu_asru_nvl, pstype, index, way, bit);
1085 if (Lxcache == NULL) {
1086 fmd_hdl_debug(hdl,
1087 "\n%s:cpu_id %d Failed to create Lxcache"
1088 " for index=%d\n",
1089 fltnm, cpu->cpu_cpuid, index);
1090 return (CMD_EVD_BAD);
1091 }
1092 }
1093 if (Lxcache->Lxcache_timeout_id != -1) {
1094 /*
1095 * We have another syndrome = 0 condition while we are
1096 * still in the process of retrying for the previous
1097 * condition.
1098 */
1099 fmd_hdl_debug(hdl,
1100 "\n%s: cpu_id = %d index = %d We have another"
1101 " syndrome = 0 condition while we have already"
1102 " scheduled a timeout. We will ignore this"
1103 " event.\n",
1104 fltnm, cpu->cpu_cpuid, index);
1105 return (CMD_EVD_OK);
1106 }
1107 fmd_hdl_debug(hdl,
1108 "\n%s: cpu_id = %d index = %d syndrome computed is 0."
1109 "Looks like the bit got corrected."
1110 " Will check later to see if it is OK.\n",
1111 fltnm, cpu->cpu_cpuid, index);
1112 /*
1113 * We need to store the following arguments passed to
1114 * this function(tag_error_handler) so that we can
1115 * invoke this function from timeout routine.
1116 *
1117 * nvl, ep, clcode
1118 */
1119 if (Lxcache->Lxcache_nvl == NULL) {
1120 if (nvlist_dup(nvl, &Lxcache->Lxcache_nvl, 0) != 0) {
1121 fmd_hdl_debug(hdl,
1122 "\n%s:cpu_id %d Failed to duplicate nvl"
1123 " for index=%d\n",
1124 fltnm, cpu->cpu_cpuid, index);
1125 return (CMD_EVD_BAD);
1126 }
1127 if (nvlist_add_int32(Lxcache->Lxcache_nvl,
1128 FM_EREPORT_RECHECK_OF_TAGS, 1) != 0) {
1129 fmd_hdl_debug(hdl,
1130 "\n%s:cpu_id %d Failed to add"
1131 " RECHECK_OF_TAGS in nvl for index=%d\n",
1132 fltnm, cpu->cpu_cpuid, index);
1133 return (CMD_EVD_BAD);
1134 }
1135 }
1136 /*
1137 * We are called with CMP_CPU_LEVEL_CORE masked out
1138 * from cmd_txce(), cmd_l3_thce() routines.
1139 * We need to set CMD_CPU_LEVEL_CORE because we want to handle
1140 * both the cores on the Chip as one single cpu_id.
1141 */
1142 Lxcache->Lxcache_clcode = (clcode | CMD_CPU_LEVEL_CORE);
1143 if (Lxcache->Lxcache_ep == NULL) {
1144 Lxcache->Lxcache_ep = ep;
1145 /*
1146 * we need to preserve the event ep so that it does
1147 * not get destroyed when we return from this call.
1148 * We do that by adding the event ep to the SERD engine.
1149 * The SERD engine we create is different from the one
1150 * we create when we handle the actual event at label
1151 * process_after_finding_way_bit.
1152 */
1153 Lxcache->Lxcache_serdnm =
1154 cmd_Lxcache_anonymous_serdnm_create(hdl,
1155 cpu->cpu_cpuid, pstype, index,
1156 way, bit);
1157 if (!fmd_serd_exists(hdl, Lxcache->Lxcache_serdnm)) {
1158 fmd_serd_create(hdl, Lxcache->Lxcache_serdnm,
1159 fmd_prop_get_int32(hdl, serdn),
1160 fmd_prop_get_int64(hdl, serdt));
1161 fmd_hdl_debug(hdl,
1162 "\n%s: cpu_id %d: created a SERD engine"
1163 " %s\n",
1164 fltnm, cpu->cpu_cpuid,
1165 Lxcache->Lxcache_serdnm);
1166 }
1167 (void) fmd_serd_record(hdl,
1168 Lxcache->Lxcache_serdnm,
1169 ep);
1170 }
1171 Lxcache->Lxcache_retry_count++;
1172 Lxcache->Lxcache_timeout_id =
1173 fmd_timer_install(hdl,
1174 (void *)CMD_TIMERTYPE_ANONYMOUS_TAG_ERROR, NULL,
1175 (cmd_Lxcache_recheck_tags_delay[
1176 Lxcache->Lxcache_retry_count] * NANOSEC));
1177 return (CMD_EVD_OK);
1178
1179 } else {
1180 /*
1181 * tag_synd != 0
1182 * determine way and bit
1183 */
1184 tag_bit = ecc_syndrome_tab[tag_synd & 0x1ff];
1185 fmd_hdl_debug(hdl,
1186 "\n%s: cpu_id = %d index = %d tag_bit %03d is faulty.\n",
1187 fltnm, cpu->cpu_cpuid, index, tag_bit);
1188 if ((tag_bit > C8)) {
1189 fmd_hdl_debug(hdl, "%s: cpu_id = %d"
1190 " Unexpected MTAG or Multiple bit error detected\n",
1191 fltnm, cpu->cpu_cpuid);
1192 find_and_destroy_anonymous_Lxcache(hdl, cpu, pstype,
1193 index);
1194 return (CMD_EVD_BAD);
1195 }
1196 if ((tag_bit >= C0) && (tag_bit <= C8)) {
1197 /*
1198 * ECC bit is corrupted.
1199 * Need to offline the CPU
1200 */
1201 bit = (tag_bit - C0) + PN_LX_TAG_ECC_START_BIT;
1202 way = 0;
1203 fmd_hdl_debug(hdl,
1204 "\n%s: cpu_id = %d ECC bit is faulty.\n",
1205 fltnm, cpu->cpu_cpuid);
1206 } else {
1207 bit = tag_bit_to_way_bit(pstype, tag_bit);
1208 way = bit_to_way(pstype, tag_bit);
1209 if (way < 0) {
1210 fmd_hdl_debug(hdl,
1211 "\n%s: cpu_id = %d %d bit indicted is a"
1212 " meta bit !!\n",
1213 fltnm, cpu->cpu_cpuid, bit);
1214 find_and_destroy_anonymous_Lxcache(hdl, cpu,
1215 pstype,
1216 index);
1217 return (CMD_EVD_BAD);
1218 }
1219 }
1220 } /* end of tag_synd != 0 */
1221 process_after_finding_way_bit:
1222 if ((Lxcache = cmd_Lxcache_lookup_by_type_index_way_bit(cpu, pstype,
1223 index, way,
1224 bit)) != NULL &&
1225 Lxcache->Lxcache_case.cc_cp != NULL &&
1226 fmd_case_solved(hdl, Lxcache->Lxcache_case.cc_cp)) {
1227 fmd_hdl_debug(hdl,
1228 "\n%s:cpu %d: the case for %s is already solved.\n",
1229 fltnm, cpu->cpu_cpuid, Lxcache->Lxcache_bufname);
1230 find_and_destroy_anonymous_Lxcache(hdl, cpu, pstype, index);
1231 return (CMD_EVD_REDUND);
1232 }
1233
1234 if (Lxcache == NULL)
1235 Lxcache = cmd_Lxcache_create(hdl, 0, cpu, cpu->cpu_asru_nvl,
1236 pstype, index, way, bit);
1237 if (Lxcache == NULL) {
1238 fmd_hdl_debug(hdl,
1239 "\n%s:cpu %d: Failed to create Lxcache for index %d",
1240 " way %d bit %d\n",
1241 fltnm, cpu->cpu_cpuid, index, way, bit);
1242 find_and_destroy_anonymous_Lxcache(hdl, cpu, pstype, index);
1243 return (CMD_EVD_BAD);
1244 }
1245 if (cmd_create_case_for_Lxcache(hdl, cpu, Lxcache) == B_FALSE) {
1246 find_and_destroy_anonymous_Lxcache(hdl, cpu, pstype, index);
1247 return (CMD_EVD_BAD);
1248 }
1249 if (Lxcache->Lxcache_case.cc_serdnm == NULL) {
1250 Lxcache->Lxcache_case.cc_serdnm = cmd_Lxcache_serdnm_create(hdl,
1251 cpu->cpu_cpuid, pstype, index,
1252 way, bit);
1253 if (!fmd_serd_exists(hdl, Lxcache->Lxcache_case.cc_serdnm)) {
1254 fmd_serd_create(hdl, Lxcache->Lxcache_case.cc_serdnm,
1255 fmd_prop_get_int32(hdl, serdn),
1256 fmd_prop_get_int64(hdl, serdt));
1257 fmd_hdl_debug(hdl,
1258 "\n%s: cpu_id %d: created a SERD engine %s\n",
1259 fltnm, cpu->cpu_cpuid,
1260 Lxcache->Lxcache_case.cc_serdnm);
1261 }
1262 }
1263 fmd_hdl_debug(hdl,
1264 "\n%s:cpu_id %d: Checking if the SERD engine %s has fired.\n",
1265 fltnm, cpu->cpu_cpuid, Lxcache->Lxcache_case.cc_serdnm);
1266
1267 (void) fmd_serd_record(hdl, Lxcache->Lxcache_case.cc_serdnm, ep);
1268 if (way >= 0) {
1269 /*
1270 * Now that we have recorded the event ep we can do the
1271 * necessary cleanup of resources allocated for recheck of tags.
1272 */
1273 find_and_destroy_anonymous_Lxcache(hdl, cpu, pstype, index);
1274 }
1275 if (fmd_serd_fired(hdl, Lxcache->Lxcache_case.cc_serdnm) ==
1276 FMD_B_FALSE)
1277 return (CMD_EVD_OK);
1278
1279 fmd_hdl_debug(hdl, "\n%s: cpu_id = %d creating fault %s\n",
1280 fltnm, cpu->cpu_cpuid, Lxcache->Lxcache_case.cc_serdnm);
1281 fmd_case_add_serd(hdl, Lxcache->Lxcache_case.cc_cp,
1282 Lxcache->Lxcache_case.cc_serdnm);
1283 fmd_serd_reset(hdl, Lxcache->Lxcache_case.cc_serdnm);
1284 if (way == -1) {
1285 /*
1286 * The assignment below is to make the code easier to maintain.
1287 * We need to destroy the anonymous_Lxcache after we have
1288 * identifed a way to retire. If we cannot detrmine a way to
1289 * retire we will destrory the anonymous_Lxcache and fault the
1290 * cpu.
1291 */
1292 anonymous_Lxcache = Lxcache;
1293 /*
1294 * Anonymous TAG way retirement.
1295 * - if a way at this index has already been retired as
1296 * "suspect-1", unretire that way, and retire the next
1297 * unretired way as "suspect-0", using a pattern of all zeros
1298 * for the PA bits.
1299 * - if a way at this index has already been retired as
1300 * "suspect-0", re-retire that way as "suspect-1", using a
1301 * pattern of all ones for the PA bits.
1302 * - if no ways have been retired as "suspect" for this index,
1303 * retire the lowest unretired way as "suspect-0" for this
1304 * bit, using a pattern of all zeros for the PA bits.
1305 * - if there is no next retirable way, fault the CPU.
1306 */
1307 suspect_Lxcache = cmd_Lxcache_lookup_by_type_index_bit_reason(
1308 cpu, pstype, index, bit, CMD_LXSUSPECT_1_TAG);
1309 anonymous_Lxcache->Lxcache_ep = ep;
1310 if (suspect_Lxcache) {
1311 ret = unretire_suspect_and_retire_next_retirable_way(
1312 hdl, cpu, suspect_Lxcache, anonymous_Lxcache,
1313 fltnm);
1314 return (ret);
1315 } /* end SUSPECT_1_TAG */
1316 suspect_Lxcache = cmd_Lxcache_lookup_by_type_index_bit_reason(
1317 cpu, pstype, index, bit, CMD_LXSUSPECT_0_TAG);
1318 if (suspect_Lxcache) {
1319 fmd_hdl_debug(hdl,
1320 "\n%s:cpu_id %d found index %d way %d"
1321 " bit %d retired as SUSPECT_0_TAG. Will"
1322 " re-retire this now as SUSPECT_1_TAG.\n",
1323 fltnm, cpu->cpu_cpuid, index,
1324 suspect_Lxcache->Lxcache_way, bit);
1325 /*
1326 * destroy the anonymous_Lxcache
1327 */
1328 cmd_Lxcache_destroy(hdl, cpu, anonymous_Lxcache);
1329 suspect_Lxcache->Lxcache_ep = ep;
1330 /*
1331 * We need to update the FM_FMRI_CPU_CACHE_BIT entry
1332 * in the Lxcache_asru_nvl. This entry was last updated
1333 * when the cacheline was retired as SUSPECT_0.
1334 * Therefore the MSB of FM_FMRI_CPU_CACHE_BIT entry
1335 * value will be reset. To retire cacheline as
1336 * SUSPECT_1 the MSB has to be set.
1337 */
1338 errno = nvlist_add_uint16(
1339 suspect_Lxcache->Lxcache_asru_nvl,
1340 FM_FMRI_CPU_CACHE_BIT,
1341 suspect_Lxcache->Lxcache_bit);
1342 if (errno) {
1343 fmd_hdl_debug(hdl,
1344 "\n%s:cpu_id %d: failed to update",
1345 " CACHE_BIT in asru.\n",
1346 fltnm, cpu->cpu_cpuid);
1347 }
1348 return (cmd_Lxcache_retire_as_reason(hdl, cpu,
1349 suspect_Lxcache, fltnm, CMD_LXSUSPECT_1_TAG));
1350 } /* end of SUSPECT_0_TAG */
1351 /*
1352 * No ways have been retired as "SUSPECT_x" for this bit.
1353 * We need to retire the lowest unretired way as suspect.
1354 */
1355 ret = retire_lowest_retirable_way_as_suspect(hdl, cpu,
1356 anonymous_Lxcache,
1357 fltnm);
1358 return (ret);
1359 } /* End of Anonymous TAG retirement */
1360 /*
1361 * Identified bit and way has fired.
1362 * - Destroy any anonymous SERD engine at that index.
1363 * - If the bad bit is an ECC bit, fault the CPU.
1364 * - If the way was already convicted due to tag errors, fault the CPU.
1365 * - If the bad bit is a state bit, then:
1366 * - if the stable value of the bad bit will hold the NA encoding,
1367 * retire the containing way as "convicted".
1368 * - if the stable value of the bad bit will not hold the NA
1369 * encoding, fault the CPU.
1370 */
1371 cmd_Lxcache_destroy_anonymous_serd_engines(hdl, cpu, pstype, index, -1);
1372 sticky_bit = find_bit_stickiness(tag_data, way, bit);
1373 if ((bit >= PN_LX_TAG_ECC_START_BIT) &&
1374 (bit <= PN_LX_TAG_ECC_END_BIT)) {
1375 fmd_hdl_debug(hdl,
1376 "\n%s:cpu_id %d Bad ECC bit %d at cache index %d way %d"
1377 " detected. Will offline the CPU.\n",
1378 fltnm, cpu->cpu_cpuid, bit, index, way);
1379 cmd_fault_the_cpu(hdl, cpu, pstype, fltnm);
1380 return (CMD_EVD_OK);
1381 }
1382 /*
1383 * Check if a STATE bit is faulty.
1384 * If so we need to ensure that we will be able to
1385 * make the way NA, else fault the CPU.
1386 */
1387 if (bit <= PN_LX_STATE_END_BIT) {
1388 fmd_hdl_debug(hdl,
1389 "%s cpu_id = %d: STATE bit %d is faulty.\n",
1390 fltnm, cpu->cpu_cpuid, bit);
1391 /*
1392 * If the stable value of bit will hold the NA encoding
1393 * retire the containing way Else fault the cpu.
1394 */
1395 state = tag_data[way] & CH_ECSTATE_MASK;
1396 if ((state & (1 << bit)) != (PN_ECSTATE_NA & (1 << bit))) {
1397 /*
1398 * The stable value of the bad bit will not hold the
1399 * NA encoding. will fault the CPU.
1400 */
1401 fmd_hdl_debug(hdl,
1402 "\n%s:cpu_id %d STATE bit %d is faulty at"
1403 " cache index %d way %d. STATE = 0x%x\n"
1404 " The bad bit will not hold the encoding we need"
1405 " to mark the cacheline as retired, so will offline"
1406 " the CPU.\n",
1407 fltnm, cpu->cpu_cpuid, bit, index, way, state);
1408 cmd_fault_the_cpu(hdl, cpu, pstype, fltnm);
1409 return (CMD_EVD_OK);
1410 }
1411 }
1412 /*
1413 * Check if we are getting fault on a way that is already retired.
1414 * if the way was already convicted due to tag errors, fault the CPU.
1415 * Note that the way could have previously been retired due to
1416 * data errors. This is okay; we just re-retire it due to tag errors,
1417 * so that we can write the offending tag bit to a stable value.
1418 */
1419 if ((tag_data[way] & CH_ECSTATE_MASK) == PN_ECSTATE_NA) {
1420 /*
1421 * Looking for CONVICTED TAG fault first.
1422 * If found retire the CPU.
1423 */
1424 retired_Lxcache = cmd_Lxcache_lookup_by_type_index_way_reason(
1425 cpu, pstype, index, way, CMD_LXCONVICTED);
1426 if (retired_Lxcache) {
1427 fmd_hdl_debug(hdl,
1428 "\n%s: cpu %d: The cache index %d way %d previously"
1429 " retired for %s fault at bit %d is reporting"
1430 " fault. Will fault the CPU\n",
1431 fltnm, cpu->cpu_cpuid, index, way,
1432 cmd_type_to_str(
1433 retired_Lxcache->Lxcache_type),
1434 retired_Lxcache->Lxcache_bit);
1435 cmd_fault_the_cpu(hdl, cpu, pstype, fltnm);
1436 return (CMD_EVD_OK);
1437 }
1438 way_already_retired = 1;
1439 }
1440 /*
1441 * If any way(Including the current way) at this index is retired as
1442 * "suspect" due to tag errors, unretire it. (If that suspect way
1443 * really was bad, it will start producing errors again and will
1444 * eventually be retired again.)
1445 */
1446 suspect_Lxcache = cmd_Lxcache_lookup_by_type_index_bit_reason(
1447 cpu, pstype, index, -1,
1448 (CMD_LXSUSPECT_0_TAG | CMD_LXSUSPECT_1_TAG));
1449 if (suspect_Lxcache) {
1450 fmd_hdl_debug(hdl,
1451 "\n%s:cpu_id %d found index %d way %d"
1452 " bit %d retired as SUSPECT_x. Will"
1453 " unretire this now.\n",
1454 fltnm, cpu->cpu_cpuid, index,
1455 suspect_Lxcache->Lxcache_way, -1);
1456 /*
1457 * unretire the suspect_x retired_way.
1458 */
1459 if (cmd_Lxcache_unretire(hdl, cpu, suspect_Lxcache, fltnm)
1460 == B_TRUE) {
1461 suspect_Lxcache->Lxcache_reason =
1462 CMD_LXFUNCTIONING;
1463 fmd_hdl_debug(hdl,
1464 "\n%s:cpu_id %d index %d way %d"
1465 " successfully unretired. Will"
1466 " destroy this Lxcache now.\n",
1467 fltnm, cpu->cpu_cpuid, index,
1468 suspect_Lxcache->Lxcache_way);
1469 cmd_Lxcache_destroy(hdl, cpu, suspect_Lxcache);
1470 } else {
1471 /*
1472 * We are unable to unretire the previously retired
1473 * SUSPECT way at the fault index.
1474 * If the previously retired way is same as the way
1475 * we are attempting to retire then return failure.
1476 */
1477 if (suspect_Lxcache->Lxcache_way ==
1478 Lxcache->Lxcache_way)
1479 return (CMD_EVD_BAD);
1480 }
1481 }
1482 ways_retired = get_index_retired_ways(cpu, pstype, index);
1483 if (ways_retired == -1)
1484 return (CMD_EVD_BAD);
1485 /*
1486 * Before retiring a way check if we have already
1487 * retired 3 ways for this index.
1488 * If the way was already retired due to DATA error or
1489 * SUSPECT_X TAG error then we skip the check.
1490 */
1491 if (!way_already_retired) {
1492 if (ways_retired >= 3) {
1493 fmd_hdl_debug(hdl,
1494 "\n%s: cpu %d: num of ways retired for index %d"
1495 " is %d will fault the CPU\n",
1496 fltnm, cpu->cpu_cpuid, index, ways_retired);
1497 cmd_fault_the_cpu(hdl, cpu, pstype, fltnm);
1498 return (CMD_EVD_OK);
1499 }
1500 }
1501 fmd_hdl_debug(hdl,
1502 "\n%s: cpu %d: num of ways retired for index %d is %d\n",
1503 fltnm, cpu->cpu_cpuid, index, ways_retired);
1504 if ((errno = nvlist_add_uint16(Lxcache->Lxcache_asru_nvl,
1505 FM_FMRI_CPU_CACHE_BIT,
1506 sticky_bit)) != 0 ||
1507 (errno = fmd_nvl_fmri_expand(hdl, Lxcache->Lxcache_asru_nvl)) != 0)
1508 fmd_hdl_abort(hdl, "failed to build Lxcache fmri");
1509 Lxcache->Lxcache_ep = ep;
1510 return (cmd_Lxcache_retire_as_reason(hdl, cpu, Lxcache, fltnm,
1511 CMD_LXCONVICTED));
1512 }
1513
/*
 * Scan the cache-line dump captured in xr->xr_cache_data for a 16-byte
 * data chunk whose computed ECC syndrome matches the syndrome reported
 * in the ereport (xr->xr_synd).  Ways whose tag state is retired (NA) or
 * invalid are skipped.  If a match is found in a multi-way dump, the
 * matching way is written back into xr->xr_error_way.
 * Returns B_TRUE if a matching syndrome was found, B_FALSE otherwise.
 */
static boolean_t
pn_there_is_a_matching_synd(fmd_hdl_t *hdl, cmd_xr_t *xr)
{
	int ec_data_idx, i;
	int8_t	way;
	uint64_t ec_tag, data_hi, data_lo;
	int ecc, calc_synd;
	ec_data_elm_t *ecdptr = NULL;
	uint8_t state;
	ch_ec_data_t *ecp;

	/* xr_cache_data holds one ch_ec_data_t per collected way. */
	ecp = (ch_ec_data_t *)(xr->xr_cache_data);
	for (way = 0; way < xr->xr_num_ways; way++, ecp++) {
		ec_tag = ecp->ec_tag;
		/*
		 * skip Retired and Invalid ways
		 */
		state = ec_tag & CH_ECSTATE_MASK;
		if ((state == PN_ECSTATE_NA) ||
		    (state == CH_ECSTATE_INV))
			continue;
		/*
		 * Each 16 bytes of data are protected by 9-bit ECC field.
		 */

		for (i = 0; i < (CH_ECACHE_SUBBLK_SIZE/16); i++) {
			/*
			 * Each ec_data element covers two 16-byte chunks:
			 * the even chunk's 9-bit ECC sits in bits 17:9 of
			 * ec_eccd and its data in ec_d8[0..1]; the odd
			 * chunk's ECC is in bits 8:0 with data in
			 * ec_d8[2..3].
			 */
			ec_data_idx = (i/2);

			ecdptr = &ecp->ec_data[ec_data_idx];
			if ((i & 1) == 0) {
				ecc = (ecdptr->ec_eccd >> 9) & 0x1ff;
				data_hi = ecdptr->ec_d8[0];
				data_lo = ecdptr->ec_d8[1];
			} else {
				ecc = ecdptr->ec_eccd & 0x1ff;
				data_hi = ecdptr->ec_d8[2];
				data_lo = ecdptr->ec_d8[3];
			}

			calc_synd = calcsynd(data_hi, data_lo, ecc);
			if ((calc_synd != 0) &&
			    (xr->xr_synd == calc_synd)) {
				if (xr->xr_num_ways == 1) {
					/*
					 * Single-way dump: the error way was
					 * already identified at collection
					 * time, so leave xr_error_way alone.
					 */
					fmd_hdl_debug(hdl,
"\ncomputed syndrome matches with the reported syndrome"
					    " 0x%x index = %d way = %d\n",
					    xr->xr_synd, xr->xr_error_index,
					    xr->xr_error_way);
				} else {
					fmd_hdl_debug(hdl,
					    "\ncomputed syndrome matches with"
					    " the reported syndrome"
					    " 0x%x index = %d way = %d\n",
					    xr->xr_synd, xr->xr_error_index,
					    way);
					/* Record the way we just identified. */
					xr->xr_error_way = way;
				}
				return (B_TRUE);
			}
		}
	}
	return (B_FALSE);
}
1577
1578 /* add to cheetahregs.h */
1579 #define CH_ECSTATE_NA 5
1580
1581 static int32_t
pn_extract_index(int32_t type,uint64_t afar)1582 pn_extract_index(int32_t type, uint64_t afar)
1583 {
1584 int32_t index = -1;
1585
1586 switch (type) {
1587 case CMD_PTR_CPU_L2DATA:
1588 index = (int32_t)((afar & PN_L2_INDEX_MASK)
1589 >> PN_CACHE_LINE_SHIFT);
1590 break;
1591 case CMD_PTR_CPU_L3DATA:
1592 index = (int32_t)((afar & PN_L3_INDEX_MASK)
1593 >> PN_CACHE_LINE_SHIFT);
1594 break;
1595 }
1596 return (index);
1597 }
1598
1599 /*
1600 * cmd_cache_ce_panther
1601 *
1602 * This routine handles L2 and L3 cachedata errors for the Panther.
1603 * It's called when the train processing for L2 and L3 correctable
1604 * data errors are about to issue a fault.
1605 *
1606 * This routine retrieves payload information gathered during the XR
1607 * processing and generates a unique SERD engine and cache data
1608 * associated with the CPU if one does not exist.
1609 * If the SERD fires for the given engine it will initiate a cache
 * line fault if the way is not anonymous.
 * If the way is anonymous, it will attempt to choose a way for the
1612 * given index to fault. If the maximum for the index has not been
1613 * reached, it will attempt to unretire a different way previously retired
1614 * under suspicion for the index prior to faulting
1615 * the selected way.
1616 * The routine will also fault the CPU if the maximum number of
1617 * retired ways for the CPU has been exceeded based on the category.
1618 */
1619 /*ARGSUSED*/
1620 int
cmd_cache_ce_panther(fmd_hdl_t * hdl,fmd_event_t * ep,cmd_xr_t * xr)1621 cmd_cache_ce_panther(fmd_hdl_t *hdl, fmd_event_t *ep, cmd_xr_t *xr)
1622 {
1623 cmd_Lxcache_t *suspect_Lxcache, *Lxcache, *anonymous_Lxcache;
1624 cmd_cpu_t *cpu = xr->xr_cpu;
1625 cmd_case_t *cpu_cc;
1626 cmd_ptrsubtype_t type;
1627 const errdata_t *cache_ed;
1628 uint16_t offset;
1629 int16_t bit;
1630 int ways_retired;
1631 int ret;
1632
1633 /*
1634 * The caller of this routine cmd_xxc_hdlr() expects us to
1635 * return CMD_EVD_OK for success and CMD_EVD_BAD for failures.
1636 * If this is not a Panther or one of the Panther specific
1637 * errors that we handle here, then exit
1638 */
1639
1640 if (cpu->cpu_pers.cpup_type != CPU_ULTRASPARC_IVplus)
1641 return (CMD_EVD_BAD);
1642
1643 if (!(xr->xr_clcode & (int)PN_CACHE_ERRORS))
1644 return (CMD_EVD_BAD);
1645
1646
1647 /* Set up Cache specific structs */
1648
1649 if (CMD_ERRCL_ISL2XXCU(xr->xr_clcode)) {
1650 type = CMD_PTR_CPU_L2DATA;
1651 cpu_cc = &cpu->cpu_l2data;
1652 cache_ed = &l2errdata;
1653 } else {
1654 type = CMD_PTR_CPU_L3DATA;
1655 cpu_cc = &cpu->cpu_l3data;
1656 cache_ed = &l3errdata;
1657 }
1658
1659 /* Ensure that our case is not solved */
1660
1661 if (cpu->cpu_faulting || (cpu_cc->cc_cp != NULL &&
1662 fmd_case_solved(hdl, cpu_cc->cc_cp)))
1663 return (CMD_EVD_OK);
1664
1665 fmd_hdl_debug(hdl, "Processing Panther %s Error\n",
1666 cache_ed->ed_fltnm);
1667
1668 /* L3 errors arrive as mem scheme errors - convert to CPU */
1669 if (type == CMD_PTR_CPU_L3DATA) {
1670 cmd_fmri_init(hdl, &xr->xr_rsrc,
1671 xr->xr_detector_nvlist, "%s_rsrc",
1672 fmd_case_uuid(hdl, xr->xr_case));
1673 }
1674 bit = (uint8_t)ecc_syndrome_tab[xr->xr_synd];
1675 offset = (uint16_t)xr->xr_afar & 0x3f;
1676 if (bit > C8) {
1677 fmd_hdl_debug(hdl, "xxC/LDxC dropped due to syndrome\n");
1678 return (CMD_EVD_BAD);
1679 }
1680 if (bit < C0) {
1681 /*
1682 * Data bit. Set bit in the range 0-511
1683 */
1684 bit += ((3 - (offset/16)) * 128);
1685 } else {
1686 /*
1687 * ECC bit. Set bit in the range 512-547
1688 */
1689 bit -= C0;
1690 bit += 512 + ((3 - (offset/16)) * PN_LX_NUM_OF_BITS_IN_ECC);
1691 }
1692 xr->xr_error_index = pn_extract_index(type, xr->xr_afar);
1693 if (xr->xr_error_index == 0xffffffff) {
1694 fmd_hdl_debug(hdl, "xxC/LDxC dropped due to index\n");
1695 return (CMD_EVD_BAD);
1696 }
1697 fmd_hdl_debug(hdl, "cpu_id: %d, syndrome: 0x%x, afar: 0x%llx\n",
1698 xr->xr_cpuid, xr->xr_synd, xr->xr_afar);
1699 fmd_hdl_debug(hdl, "index: 0x%x(%d) bit: %d\n",
1700 xr->xr_error_index, xr->xr_error_index, bit);
1701 /*
1702 * The payload information for the DATA errors are assembled
1703 * after first looking for a valid line that matches the fault AFAR.
1704 * If no match is found all 4 ways are logged and xr_num_ways
1705 * will be 4. If a matching way is found only that entry is logged
1706 * and xr_num_ways is set as 1.
1707 * The xr_error_way is set as -1 when xr_num_ways is 4, else
1708 * xr_error_way is set to the matching way.
1709 * what we do below is to force the xr_error_way to -1 for WDC/CPC
1710 * errors.
1711 * For UCC and EDC errors the xr_error_way will be set correctly.
1712 */
1713
1714 switch (xr->xr_clcode) {
1715 case CMD_ERRCL_WDC:
1716 case CMD_ERRCL_L3_WDC:
1717 /*
1718 * WDC is a disrupting trap, and invalidates and
1719 * overwrites the problematic way. Any match is due to
1720 * a refetch of the AFAR, which could have been to any
1721 * way. So these are treated as "anonymous".
1722 */
1723 fmd_hdl_debug(hdl, "WDC fault detected\n");
1724 xr->xr_error_way = (uint32_t)CMD_ANON_WAY;
1725 break;
1726 case CMD_ERRCL_CPC:
1727 case CMD_ERRCL_L3_CPC:
1728 /*
1729 * CPC is a disrupting trap, but since it happens due to
1730 * a snoop, the problematic way could become invalid,
1731 * overwritten by a different cache line, and then the
1732 * AFAR accessed and pulled into a different way,
1733 * causing a false positive match. So it's best to not
1734 * look for a matching way and just ascribe these to
1735 * the "anonymous" way.
1736 */
1737 fmd_hdl_debug(hdl, "CPC fault detected\n");
1738 xr->xr_error_way = (uint32_t)CMD_ANON_WAY;
1739 break;
1740 case CMD_ERRCL_UCC:
1741 case CMD_ERRCL_L3_UCC:
1742 /*
1743 * UCC is a precise trap, so, absent activity from the
1744 * other core, the tag address values read by the TL=1
1745 * trap handler are likely to be the same as those at
1746 * the time of the trap.
1747 * (A snoop from another CPU might cause a change in
1748 * state from valid to invalid, but the tag address
1749 * won't change.) If we find a matching valid tag,
1750 * that identifies the way.
1751 */
1752 fmd_hdl_debug(hdl, "UCC fault detected\n");
1753 fmd_hdl_debug(hdl, "# of ways collected are %d\n",
1754 xr->xr_num_ways);
1755 fmd_hdl_debug(hdl,
1756 "\n%s:cpu_id %d: error way = %d\n",
1757 cache_ed->ed_fltnm, cpu->cpu_cpuid,
1758 xr->xr_error_way);
1759 break;
1760 case CMD_ERRCL_EDC:
1761 case CMD_ERRCL_L3_EDC:
1762 /*
1763 * EDC is a disrupting trap, but again if a matching
1764 * valid way is found, it is likely to be the correct
1765 * way.
1766 */
1767 fmd_hdl_debug(hdl, "EDC fault detected\n");
1768 fmd_hdl_debug(hdl, "# of ways collected are %d\n",
1769 xr->xr_num_ways);
1770 fmd_hdl_debug(hdl,
1771 "\n%s:cpu_id %d: error way = %d\n",
1772 cache_ed->ed_fltnm, cpu->cpu_cpuid,
1773 xr->xr_error_way);
1774 break;
1775 default:
1776 fmd_hdl_debug(hdl, "Unexpected fault detected\n");
1777 xr->xr_error_way = (uint32_t)CMD_ANON_WAY;
1778 }
1779 if ((type == CMD_PTR_CPU_L2DATA) &&
1780 (xr->xr_cache_data != NULL) &&
1781 (!pn_there_is_a_matching_synd(hdl, xr))) {
1782 fmd_hdl_debug(hdl, "No matching syndrome\n");
1783 }
1784 Lxcache = cmd_Lxcache_lookup_by_type_index_way_bit(xr->xr_cpu, type,
1785 xr->xr_error_index, xr->xr_error_way, bit);
1786
1787 if (Lxcache == NULL) {
1788 fmd_hdl_debug(hdl,
1789 "\n%s: cpu %d: creating a case for index %d way %d"
1790 " bit %d\n",
1791 cache_ed->ed_fltnm, xr->xr_cpuid,
1792 xr->xr_error_index, xr->xr_error_way, bit);
1793 Lxcache = cmd_Lxcache_create(hdl, xr, xr->xr_cpu,
1794 xr->xr_cpu->cpu_asru_nvl,
1795 type, xr->xr_error_index,
1796 xr->xr_error_way, bit);
1797 if (Lxcache == NULL) {
1798 fmd_hdl_debug(hdl,
1799 "\n%s:cpu_id %d:Failed to create a Lxcache for"
1800 " index %d way %d bit %d\n",
1801 cache_ed->ed_fltnm, cpu->cpu_cpuid,
1802 Lxcache->Lxcache_index,
1803 Lxcache->Lxcache_way, Lxcache->Lxcache_bit);
1804 return (CMD_EVD_BAD);
1805 }
1806 }
1807 if (cmd_create_case_for_Lxcache(hdl, cpu, Lxcache) == B_FALSE)
1808 return (CMD_EVD_BAD);
1809 if (Lxcache->Lxcache_case.cc_serdnm == NULL) {
1810 Lxcache->Lxcache_case.cc_serdnm =
1811 cmd_Lxcache_serdnm_create(hdl, xr->xr_cpuid,
1812 type, xr->xr_error_index, xr->xr_error_way, bit);
1813
1814 if (!fmd_serd_exists(hdl,
1815 Lxcache->Lxcache_case.cc_serdnm)) {
1816 fmd_serd_create(hdl,
1817 Lxcache->Lxcache_case.cc_serdnm,
1818 cache_ed->ed_serd->cs_n,
1819 cache_ed->ed_serd->cs_t);
1820 fmd_hdl_debug(hdl,
1821 "\n%s: cpu_id %d: created a SERD engine %s\n",
1822 cache_ed->ed_fltnm, cpu->cpu_cpuid,
1823 Lxcache->Lxcache_case.cc_serdnm);
1824 }
1825 }
1826 /* Ensure that our case is not solved */
1827 if ((Lxcache->Lxcache_case.cc_cp != NULL) &&
1828 fmd_case_solved(hdl, Lxcache->Lxcache_case.cc_cp)) {
1829 fmd_hdl_debug(hdl,
1830 "\n%s:cpu %d: the case for %s is already solved.\n",
1831 cache_ed->ed_fltnm, cpu->cpu_cpuid,
1832 Lxcache->Lxcache_bufname);
1833 return (CMD_EVD_REDUND);
1834 }
1835
1836 fmd_hdl_debug(hdl,
1837 "\n%s:cpu_id %d: checking if SERD engine %s has fired.\n",
1838 cache_ed->ed_fltnm, xr->xr_cpuid, Lxcache->Lxcache_case.cc_serdnm);
1839
1840 if (fmd_serd_record(hdl, Lxcache->Lxcache_case.cc_serdnm, ep)
1841 == FMD_B_FALSE)
1842 return (CMD_EVD_OK); /* serd engine hasn't fired yet */
1843
1844 fmd_hdl_debug(hdl, "\n%s: cpu_id = %d creating fault %s\n",
1845 cache_ed->ed_fltnm, cpu->cpu_cpuid,
1846 Lxcache->Lxcache_case.cc_serdnm);
1847 fmd_case_add_serd(hdl, Lxcache->Lxcache_case.cc_cp,
1848 Lxcache->Lxcache_case.cc_serdnm);
1849 fmd_serd_reset(hdl, Lxcache->Lxcache_case.cc_serdnm);
1850 /*
1851 * Find out if there is a way at the fault index/bit that was retired
1852 * as suspect. We need this information for both anonymous way and
1853 * identified way handling. We store this info in suspect_Lxcache.
1854 */
1855 fmd_hdl_debug(hdl,
1856 "\n%s:cpu_id %d checking if there is a way at"
1857 " index %d retired as suspect due to bit %d\n",
1858 cache_ed->ed_fltnm, cpu->cpu_cpuid,
1859 Lxcache->Lxcache_index, Lxcache->Lxcache_bit);
1860 suspect_Lxcache = cmd_Lxcache_lookup_by_type_index_bit_reason(
1861 cpu, type, Lxcache->Lxcache_index, Lxcache->Lxcache_bit,
1862 CMD_LXSUSPECT_DATA);
1863 if (xr->xr_error_way != (uint32_t)CMD_ANON_WAY) {
1864 /*
1865 * IDENTIFIED WAY DATA error handling.
1866 *
1867 * If there is a way at that index retired as suspect due
1868 * to that bit, unretire it.
1869 * retire the identified way, and mark the way as "convicted"
1870 * for this bit. Destroy any anonymous SERD engine named by
1871 * that index and bit.
1872 */
1873 if (suspect_Lxcache != NULL) {
1874 fmd_hdl_debug(hdl,
1875 "\n%s:cpu_id %d found index %d way %d"
1876 " bit %d retired on suspicion. Will"
1877 " unretire this now.\n",
1878 cache_ed->ed_fltnm, cpu->cpu_cpuid,
1879 suspect_Lxcache->Lxcache_index,
1880 suspect_Lxcache->Lxcache_way,
1881 suspect_Lxcache->Lxcache_bit);
1882 /*
1883 * unretire the retired_way.
1884 */
1885 if (cmd_Lxcache_unretire(hdl, cpu, suspect_Lxcache,
1886 cache_ed->ed_fltnm) == B_TRUE) {
1887 suspect_Lxcache->Lxcache_reason =
1888 CMD_LXFUNCTIONING;
1889 cmd_Lxcache_destroy(hdl, cpu, suspect_Lxcache);
1890 }
1891 /*
1892 * We proceed to retire the identified way even if
1893 * we are unable to unretire the suspect way.
1894 * We will not end up retiring all 4 ways because
1895 * we check the actual number of ways retired
1896 * at this index by reading the info from processor
1897 * directly. The call to get_index_retired_ways() does
1898 * that.
1899 */
1900 }
1901 /*
1902 * Before retiring a way check if we have already
1903 * retired 3 ways for this index.
1904 */
1905 ways_retired = get_index_retired_ways(cpu, type,
1906 Lxcache->Lxcache_index);
1907 if (ways_retired == -1) {
1908 fmd_hdl_debug(hdl,
1909 "\n%s: cpu %d: We are unable to determine how many"
1910 " ways are retired at this index. We will not be"
1911 " retiring the identified cacheline at index %d"
1912 " way %d\n",
1913 cache_ed->ed_fltnm, cpu->cpu_cpuid,
1914 Lxcache->Lxcache_index, Lxcache->Lxcache_way);
1915 return (CMD_EVD_BAD);
1916 }
1917 if (ways_retired >= 3) {
1918 fmd_hdl_debug(hdl,
1919 "\n%s: cpu %d: num of ways retired for index %d"
1920 " is %d. Will fault the CPU\n",
1921 cache_ed->ed_fltnm, cpu->cpu_cpuid,
1922 Lxcache->Lxcache_index, ways_retired);
1923 cmd_fault_the_cpu(hdl, cpu, type, cache_ed->ed_fltnm);
1924 return (CMD_EVD_OK);
1925 }
1926 /*
1927 * retire the cache line
1928 */
1929 ret = cmd_Lxcache_retire_as_reason(hdl, cpu, Lxcache,
1930 cache_ed->ed_fltnm, CMD_LXCONVICTED);
1931 if (ret != CMD_EVD_OK)
1932 return (ret);
1933 /*
1934 * anonymous serd engines for DATA faults will have valid bit
1935 * but way as -1.
1936 */
1937 cmd_Lxcache_destroy_anonymous_serd_engines(hdl, cpu, type,
1938 Lxcache->Lxcache_index,
1939 bit);
1940 return (CMD_EVD_OK);
1941 } /* end of IDENTIFIED WAY error handling */
1942 /*
1943 * ANONYMOUS WAY DATA error handling.
1944 *
1945 * - if a way at this index has already been retired as "suspect"
1946 * for this bit, unretire that way, and retire the next retirable
1947 * way as "suspect" for this bit.
1948 * - if no ways have been retired as "suspect" for this bit,
1949 * retire the lowest unretired way as "suspect" for this bit.
1950 * - if there is no next retirable way, fault the CPU.
1951 */
1952 /*
1953 * The assignment below is to make the code easier to maintain.
1954 * We need to destroy the anonymous_Lxcache after we have
1955 * identifed a way to retire. If we cannot detrmine a way to
1956 * retire we will destrory the anonymous_Lxcache and fault the cpu.
1957 */
1958 anonymous_Lxcache = Lxcache;
1959 anonymous_Lxcache->Lxcache_ep = ep;
1960 if (suspect_Lxcache != NULL) {
1961 ret = unretire_suspect_and_retire_next_retirable_way(hdl,
1962 cpu, suspect_Lxcache, anonymous_Lxcache,
1963 cache_ed->ed_fltnm);
1964 } else {
1965 ret = retire_lowest_retirable_way_as_suspect(hdl, cpu,
1966 anonymous_Lxcache, cache_ed->ed_fltnm);
1967 }
1968 return (ret);
1969 }
1970
1971 /* ARGSUSED */
1972 int
cmd_xr_pn_cache_fill(fmd_hdl_t * hdl,nvlist_t * nvl,cmd_xr_t * xr,cmd_cpu_t * cpu,cmd_errcl_t clcode)1973 cmd_xr_pn_cache_fill(fmd_hdl_t *hdl, nvlist_t *nvl, cmd_xr_t *xr,
1974 cmd_cpu_t *cpu, cmd_errcl_t clcode)
1975 {
1976 struct ch_ec_data *data_ptr;
1977 uint64_t *cache_data = NULL;
1978 uint_t sz;
1979
1980 if (cpu->cpu_pers.cpup_type != CPU_ULTRASPARC_IVplus)
1981 return (0);
1982
1983 if (nvlist_lookup_nvlist(nvl, FM_EREPORT_DETECTOR,
1984 &xr->xr_detector_nvlist) != 0) {
1985 fmd_hdl_debug(hdl, "look up for FM_EREPORT_DETECTOR failed\n");
1986 return (-1);
1987 }
1988 if (nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_NAME_AFSR,
1989 &xr->xr_afsr) != 0) {
1990 fmd_hdl_debug(hdl,
1991 "look up for FM_EREPORT_PAYLOAD_NAME_AFSR failed\n");
1992 return (-1);
1993 }
1994
1995 /* check clcode for l2/l3 first */
1996 if (CMD_ERRCL_ISL3XXCU(clcode)) {
1997 if (nvlist_lookup_uint8(nvl, FM_EREPORT_PAYLOAD_NAME_L3_WAYS,
1998 &xr->xr_num_ways) != 0) {
1999 fmd_hdl_debug(hdl,
2000 "look up for FM_EREPORT_PAYLOAD_NAME_L3_WAYS failed\n");
2001 return (-1);
2002 }
2003
2004 if (nvlist_lookup_uint64_array(nvl,
2005 FM_EREPORT_PAYLOAD_NAME_L3_DATA, (uint64_t **)&cache_data,
2006 &sz) != 0) {
2007 fmd_hdl_debug(hdl,
2008 "look up for FM_EREPORT_PAYLOAD_NAME_L3_DATA failed\n");
2009 }
2010 } else {
2011 if (nvlist_lookup_uint8(nvl, FM_EREPORT_PAYLOAD_NAME_L2_WAYS,
2012 &xr->xr_num_ways) != 0) {
2013 fmd_hdl_debug(hdl,
2014 "look up for FM_EREPORT_PAYLOAD_NAME_L2_WAYS failed\n");
2015 return (-1);
2016 }
2017
2018 if (nvlist_lookup_uint64_array(nvl,
2019 FM_EREPORT_PAYLOAD_NAME_L2_DATA, (uint64_t **)&cache_data,
2020 &sz) != 0) {
2021 fmd_hdl_debug(hdl,
2022 "look up for FM_EREPORT_PAYLOAD_NAME_L2_DATA failed\n");
2023 }
2024 }
2025 if (xr->xr_num_ways > PN_CACHE_NWAYS) {
2026 fmd_hdl_debug(hdl,
2027 "xr_num_ways > PN_CACHE_WAYS\n");
2028 return (-1);
2029 }
2030
2031 xr->xr_cache_data = cache_data;
2032 data_ptr = (struct ch_ec_data *)cache_data;
2033 if (cache_data == NULL) {
2034 xr->xr_error_way = (uint32_t)CMD_ANON_WAY;
2035 return (0);
2036 }
2037
2038 /*
2039 * Our error handler checks for a matching valid way
2040 * If there is a match, there is only 1 data set, the set
2041 * associated with the cache-line/way that was "valid"
2042 * Otherwise, it stores all of the ways
2043 */
2044 xr->xr_error_tag = data_ptr[0].ec_tag;
2045 xr->xr_error_way = (uint32_t)data_ptr[0].ec_way;
2046
2047 /* If there is more than 1 way structure, set way to Anonymous */
2048 if (xr->xr_num_ways > 1)
2049 xr->xr_error_way = (uint32_t)CMD_ANON_WAY;
2050
2051 return (0);
2052 }
2053