xref: /titanic_52/usr/src/cmd/fm/modules/sun4/cpumem-diagnosis/cmd_cpu.h (revision 45e662eb8429b38c18931ebeed30f2e5287ae51b)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #ifndef _CMD_CPU_H
27 #define	_CMD_CPU_H
28 
29 /*
30  * Each CPU of interest has a cmd_cpu_t structure.  CPUs become of interest when
31  * they are the focus of ereports, or when they detect UEs.  CPUs may be the
32  * target of several different kinds of ereport, each of which is tracked
33  * differently.  cpu_cases lists the types of cases that can be open against a
34  * given CPU.  The life of a CPU is complicated by the fact that xxCs and xxUs
35  * received by the DE may in fact be side-effects of earlier UEs, xxCs, or xxUs.
36  * Causes of side-effects, and actions taken to resolve them, can be found below
37  * and in cmd_memerr.h.
38  *
39  * Data structures:
40  *      ________                                   CMD_PTR_CPU_ICACHE
41  *     /        \       ,--------.                 CMD_PTR_CPU_DCACHE
42  *     |CPU     | <---- |case_ptr| (one or more of CMD_PTR_CPU_PCACHE         )
43  *     |        |       `--------'                 CMD_PTR_CPU_ITLB
44  *     |,-------|       ,-------.                  CMD_PTR_CPU_DTLB
45  *     ||asru   | ----> |fmri_t |                  CMD_PTR_CPU_L2DATA
46  *     |:-------|       :-------:                  CMD_PTR_CPU_L2DATA_UERETRY
47  *     ||fru    | ----> |fmri_t |                  CMD_PTR_CPU_L2TAG
48  *     |`-------|       `-------'                  CMD_PTR_CPU_L3DATA
49  *     |        |       ,---------.                CMD_PTR_CPU_L3DATA_UERETRY
50  *     | uec    | ----> |UE cache |                CMD_PTR_CPU_L3TAG
51  *     \________/       `---------'                CMD_PTR_CPU_FPU
52  *						   CMD_PTR_CPU_IREG
53  *						   CMD_PTR_CPU_FREG
54  *						   CMD_PTR_CPU_MAU
55  *						   CMD_PTR_CPU_L2CTL
56  *
57  *      ________
58  *     /        \       ,--------.
59  *     | xr     | <---- |case_ptr| (CMD_PTR_XR_WAITER)
60  *     |        |       `--------'
61  *     |,-------|       ,-------.
62  *     ||rsrc   | ----> |fmri_t |
63  *     |`-------|       `-------'
64  *     | cpu    | ----> detecting CPU
65  *     \________/
66  *
67  * Data structure	P?  Case- Notes
68  *                          Rel?
69  * ----------------	--- ----- --------------------------------------
70  * cmd_cpu_t		Yes No    Name is derived from CPU ID ("cpu_%d")
71  * cmd_case_ptr_t	Yes Yes   Name is case's UUID
72  * cpu_asru (fmri_t)	Yes No    Name is derived from CPU ID ("cpu_asru_%d")
73  * cpu_fru (fmri_t)	Yes No    Name is derived from CPU ID ("cpu_fru_%d")
74  * cpu_uec		Yes No    Name is derived from CPU ID ("cpu_uec_%d")
75  * cmd_xr_t		Yes Yes   Name is `redelivery'
76  * xr_rsrc (fmri_t)     Yes No    Name is derived from case's UUID ("%s_rsrc")
77  */
78 
79 #include <cmd.h>
80 #include <cmd_state.h>
81 #include <cmd_fmri.h>
82 
83 #ifdef __cplusplus
84 extern "C" {
85 #endif
86 
87 #define	CPU_FRU_FMRI		FM_FMRI_SCHEME_HC":///" \
88     FM_FMRI_LEGACY_HC"="
89 
90 #define	BK_LFUFAULT_CERT	50
91 
92 typedef struct cmd_cpu cmd_cpu_t;
93 
94 typedef enum cmd_cpu_type {	/* CPU models; held in the cpu*_type fields */
95 	CPU_ULTRASPARC_III = 1,		/* numbering starts at 1, not 0 */
96 	CPU_ULTRASPARC_IIIplus,
97 	CPU_ULTRASPARC_IIIi,
98 	CPU_ULTRASPARC_IV,
99 	CPU_ULTRASPARC_IVplus,
100 	CPU_ULTRASPARC_IIIiplus,
101 	CPU_ULTRASPARC_T1,
102 	CPU_SPARC64_VI,
103 	CPU_SPARC64_VII,
104 	CPU_ULTRASPARC_T2,
105 	CPU_ULTRASPARC_T2plus
106 } cmd_cpu_type_t;
107 
108 typedef struct cmd_cpu_cases {	/* case types openable against a CPU */
109 	cmd_case_t cpuc_icache;		/* All I$ errors (IPE, IDSPE, etc) */
110 	cmd_case_t cpuc_dcache;		/* All D$ errors (DPE, DDSPE, etc) */
111 	cmd_case_t cpuc_pcache;		/* All P$ errors (PDSPE) */
112 	cmd_case_t cpuc_itlb;		/* ITLB errors (ITLBPE) */
113 	cmd_case_t cpuc_dtlb;		/* DTLB errors (DTLBPE) */
114 	cmd_case_t cpuc_l2data;		/* All correctable L2$ data errors */
115 	cmd_case_t cpuc_l2tag;		/* All correctable L2$ tag errors */
116 	cmd_case_t cpuc_l3data;		/* All correctable L3$ data errors */
117 	cmd_case_t cpuc_l3tag;		/* All correctable L3$ tag errors */
118 	cmd_case_t cpuc_fpu;		/* FPU errors */
119 	cmd_case_t cpuc_ireg;		/* Integer reg errors (IRC, IRU) */
120 	cmd_case_t cpuc_freg;		/* Floating-point reg errs (frc, fru) */
121 	cmd_case_t cpuc_mau;		/* Modular arith errors (MAU) */
122 	cmd_case_t cpuc_l2ctl;		/* L2$ directory, VUAD parity */
123 	cmd_case_t cpuc_misc_regs;	/* Scratchpad array (SCA) */
124 					/* Tick compare (TC) */
125 					/* Store buffer (SBD) */
126 					/* Trap stack array errors (TSA) */
127 	cmd_case_t cpuc_lfu;		/* Coherency link error (LFU) */
128 #ifdef sun4u
129 	cmd_case_t cpuc_opl_invsfsr;	/* Olympus-C cpu inv-sfsr errors */
130 	cmd_case_t cpuc_oplue_detcpu;	/* Olympus-C cpu det. ue (eid=CPU) */
131 	cmd_case_t cpuc_oplue_detio;	/* Olympus-C io det. ue (eid=CPU) */
132 	cmd_case_t cpuc_opl_mtlb;	/* Olympus-C mtlb errors */
133 	cmd_case_t cpuc_opl_tlbp;	/* Olympus-C tlbp errors */
134 	cmd_case_t cpuc_opl_inv_urg;	/* Olympus-C inv-urg invalid urgent */
135 	cmd_case_t cpuc_opl_cre;	/* Olympus-C cre urgent errors */
136 	cmd_case_t cpuc_opl_tsb_ctx;	/* Olympus-C tsb_ctx urgent errors */
137 	cmd_case_t cpuc_opl_tsbp;	/* Olympus-C tsbp urgent errors */
138 	cmd_case_t cpuc_opl_pstate;	/* Olympus-C pstate urgent errors */
139 	cmd_case_t cpuc_opl_tstate;	/* Olympus-C tstate urgent errors */
140 	cmd_case_t cpuc_opl_iug_f;	/* Olympus-C iug_f urgent errors */
141 	cmd_case_t cpuc_opl_iug_r;	/* Olympus-C iug_r urgent errors */
142 	cmd_case_t cpuc_opl_sdc;	/* Olympus-C sdc urgent errors */
143 	cmd_case_t cpuc_opl_wdt;	/* Olympus-C wdt urgent errors */
144 	cmd_case_t cpuc_opl_dtlb;	/* Olympus-C dtlb urgent errors */
145 	cmd_case_t cpuc_opl_itlb;	/* Olympus-C itlb urgent errors */
146 	cmd_case_t cpuc_opl_core_err;	/* Olympus-C core-err urgent errors */
147 	cmd_case_t cpuc_opl_dae;	/* Olympus-C dae urgent errors */
148 	cmd_case_t cpuc_opl_iae;	/* Olympus-C iae urgent errors */
149 	cmd_case_t cpuc_opl_uge;	/* Olympus-C uge urgent errors */
150 #endif	/* sun4u */
151 } cmd_cpu_cases_t;
152 
153 /*
154  * The UE cache.  We actually have two UE caches - the current one and the old
155  * one.  When it's time to flush the UE cache, we move the current UE cache to
156  * the old position and flush the E$.  Then, we schedule the removal of the old
157  * UE cache.  This allows a) xxUs triggered by the flush to match against the
158  * old cache, while b) still allowing new UEs to be added to the current UE
159  * cache.  UE matches will always search in both caches (if present), but
160  * additions will only end up in the current cache.  We go to all of this
161  * effort because the cost of a missed ereport (discarding due to a false match
162  * in the cache) is much less than that of a missed match.  In the latter case,
163  * the CPU will be erroneously offlined.
164  *
165  * A special case is triggered if we see a UE with a non-valid AFAR.  Without
166  * the AFAR, we aren't able to properly match subsequent xxU's.  As a result,
167  * we need to throw the cache into all-match mode, wherein all subsequent match
168  * attempts will succeed until the UE cache is flushed.
169  */
170 
171 #define	CPU_UEC_F_ALLMATCH	0x1	/* all-match mode active */
172 
173 typedef struct cmd_cpu_uec {	/* one UE cache; a CPU has a current + old one */
174 	uint64_t *uec_cache;		/* The UE cache (uec_nent slots) */
175 	uint_t uec_nent;		/* Number of allocated slots in cache */
176 	uint_t uec_flags;		/* CPU_UEC_F_* */
177 	char uec_bufname[CMD_BUFNMLEN];	/* Name of buffer used for cache */
178 } cmd_cpu_uec_t;
179 
180 extern const char *cmd_cpu_type2name(fmd_hdl_t *, cmd_cpu_type_t);
181 extern void cmd_cpu_uec_add(fmd_hdl_t *, cmd_cpu_t *, uint64_t);
182 extern int cmd_cpu_uec_match(cmd_cpu_t *, uint64_t);
183 extern void cmd_cpu_uec_clear(fmd_hdl_t *, cmd_cpu_t *);
184 extern void cmd_cpu_uec_set_allmatch(fmd_hdl_t *, cmd_cpu_t *);
185 
186 /*
187  * Certain types of xxC and xxU can trigger other types as side-effects.  These
188  * secondary ereports need to be discarded, as treating them as legitimate
189  * ereports in their own right will cause erroneous diagnosis.  As an example
190  * (see cmd_xxcu_trains for more), an L2$ UCC will usually trigger an L2$ WDC
191  * resulting from the trap handler's flushing of the L2$.  If we treat both as
192  * legitimate, we'll end up adding two ereports to the SERD engine,
193  * significantly cutting the threshold for retiring the CPU.
194  *
195  * Our saving grace is the fact that the side-effect ereports will have the same
196  * ENA as the primary.  As such, we can keep track of groups of ereports by ENA.
197  * These groups, which we'll call trains, can then be matched against a list of
198  * known trains.  The list (an array of cmd_xxcu_train_t structures) has both a
199  * description of the composition of the train and an indication as to which of
200  * the received ereports is the primary.
201  *
202  * The cmd_xxcu_trw_t is used to gather the members of the train.  When the
203  * first member comes in, we allocate a trw, recording the ENA of the ereport,
204  * as well as noting its class in trw_mask.  We then reschedule the delivery of
205  * the ereport for some configurable time in the future, trusting that all
206  * members of the train will have arrived by that time.  Subsequent ereports in
207  * the same train match the recorded ENA, and add themselves to the mask.
208  * When the first ereport is redelivered, trw_mask is used to determine whether
209  * or not a train has been seen.  An exact match is required.  If a match is
210  * made, the ereport indicated as the primary cause is used for diagnosis.
211  */
212 
213 #define	CMD_TRW_F_DELETING	0x1	/* reclaiming events */
214 #define	CMD_TRW_F_CAUSESEEN	0x2	/* cause of train already processed */
215 #define	CMD_TRW_F_GCSEEN	0x4	/* seen by GC, erased next time */
216 
217 typedef struct cmd_xxcu_trw {	/* train watcher: gathers ereports by ENA */
218 	uint64_t trw_ena;	/* the ENA for this group of ereports */
219 	uint64_t trw_afar;	/* the AFAR for this group of ereports */
220 	cmd_errcl_t trw_mask;	/* ereports seen thus far with this ENA */
221 	uint16_t trw_cpuid;	/* CPU to which this watcher belongs */
222 	uint8_t	 trw_ref;	/* number of ereports with this ENA */
223 	uint8_t	 trw_flags;	/* CMD_TRW_F_* */
224 	uint32_t trw_pad;	/* presumably explicit padding -- confirm */
225 } cmd_xxcu_trw_t;
226 
227 extern cmd_xxcu_trw_t *cmd_trw_lookup(uint64_t, uint8_t, uint64_t);
228 extern cmd_xxcu_trw_t *cmd_trw_alloc(uint64_t, uint64_t);
229 extern void cmd_trw_restore(fmd_hdl_t *);
230 extern void cmd_trw_write(fmd_hdl_t *);
231 extern void cmd_trw_ref(fmd_hdl_t *, cmd_xxcu_trw_t *, cmd_errcl_t);
232 extern void cmd_trw_deref(fmd_hdl_t *, cmd_xxcu_trw_t *);
233 
234 extern cmd_errcl_t cmd_xxcu_train_match(cmd_errcl_t);
235 
236 /*
237  * We don't have access to ereport nvlists when they are redelivered via timer.
238  * As such, we have to retrieve everything we might need for diagnosis when we
239  * first receive the ereport.  The retrieved information is stored in the
240  * cmd_xr_t, which is persisted.
241  */
242 
243 typedef struct cmd_xr cmd_xr_t;
244 
245 /*
246  * xr_hdlr can't be persisted, so we use these in xr_hdlrid to indicate the
247  * handler to be used.  xr_hdlr is then updated so it can be used directly.
248  */
249 #define	CMD_XR_HDLR_XXC		1
250 #define	CMD_XR_HDLR_XXU		2
251 #define	CMD_XR_HDLR_NOP		3
252 
253 typedef void cmd_xr_hdlr_f(fmd_hdl_t *, cmd_xr_t *, fmd_event_t *);
254 
255 /*
256  * For sun4v, the size of xr_synd is expanded to 32 bits in order to
257  * accommodate the Niagara L2 syndrome (4x7 bits).
258  */
259 
260 struct cmd_xr {		/* persisted ereport data kept for timer redelivery */
261 	cmd_list_t xr_list;	/* presumably list linkage -- confirm */
262 	id_t xr_id;		/* ID of timer used for redelivery */
263 	cmd_cpu_t *xr_cpu;	/* Detecting CPU, recalc'd from cpuid */
264 	uint32_t xr_cpuid;	/* ID of detecting CPU */
265 	uint64_t xr_ena;	/* ENA from ereport */
266 	uint64_t xr_afar;	/* AFAR from ereport nvlist */
267 #ifdef sun4u
268 	uint16_t xr_synd;	/* syndrome from ereport nvlist */
269 #else /* sun4u */
270 	uint32_t xr_synd;	/* for Niagara, enlarged to 32 bits */
271 #endif /* sun4u */
272 	uint8_t xr_afar_status;	/* AFAR status from ereport nvlist */
273 	uint8_t xr_synd_status;	/* syndrome status from ereport nvlist */
274 	cmd_fmri_t xr_rsrc;	/* resource from ereport nvlist */
275 	cmd_errcl_t xr_clcode;	/* CMD_ERRCL_* for this ereport */
276 	cmd_xr_hdlr_f *xr_hdlr;	/* handler, recalc'd from hdlrid on restart */
277 	uint_t xr_hdlrid;	/* CMD_XR_HDLR_*, used for recalc of hdlr */
278 	fmd_case_t *xr_case;	/* Throwaway case used to track redelivery */
279 	uint_t xr_ref;		/* Number of references to this struct */
280 #ifdef sun4u
281 	uint64_t xr_afsr;	/* AFSR from ereport nvlist */
282 	uint8_t  xr_num_ways;   /* Number of Cache ways reporting from nvlist */
283 	uint32_t xr_error_way;  /* The way from the ereport nvlist payload */
284 	uint64_t xr_error_tag;  /* The tag from the ereport nvlist payload */
285 	uint32_t xr_error_index; /* the index from the ereport payload */
286 	uint64_t *xr_cache_data; /* The cache data */
287 	nvlist_t *xr_detector_nvlist; /* The detecting resource */
288 #endif	/* sun4u */
289 };
290 
291 #define	xr_rsrc_nvl		xr_rsrc.fmri_nvl
292 
293 extern cmd_xr_t *cmd_xr_create(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
294     cmd_cpu_t *, cmd_errcl_t);
295 extern cmd_evdisp_t cmd_xr_reschedule(fmd_hdl_t *, cmd_xr_t *, uint_t);
296 extern void cmd_xr_deref(fmd_hdl_t *, cmd_xr_t *);
297 extern void cmd_xr_write(fmd_hdl_t *, cmd_xr_t *);
298 
299 extern void cmd_xxc_resolve(fmd_hdl_t *, cmd_xr_t *, fmd_event_t *);
300 extern void cmd_xxu_resolve(fmd_hdl_t *, cmd_xr_t *, fmd_event_t *);
301 extern void cmd_nop_resolve(fmd_hdl_t *, cmd_xr_t *, fmd_event_t *);
302 extern cmd_evdisp_t cmd_xxcu_initial(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
303     const char *, cmd_errcl_t,  uint_t);
304 
305 /*
306  * The master structure containing or referencing all of the state for a given
307  * CPU.
308  */
309 
310 /*
311  * We periodically flush the E$, thus allowing us to flush the UE cache (see
312  * above for a description of the UE cache).  In particular, we flush it
313  * whenever we see a UE with a non-valid AFAR.  To keep from overflushing the
314  * CPU, we cap the number of flushes that we'll do in response to UEs with
315  * non-valid AFARs.  The cap is the number of permitted flushes per GC/restart
316  * cycle, and was determined arbitrarily.
317  */
318 #define	CPU_UEC_FLUSH_MAX	3
319 
320 /*
321  * The CPU structure started life without a version number.  Making things more
322  * complicated, the version number in the new struct occupies the space used for
323  * cpu_cpuid in the non-versioned struct.  We therefore have to use somewhat
324  * unorthodox version numbers to distinguish between the two types of struct
325  * (pre- and post-versioning) -- version numbers that can't be mistaken for
326  * CPUIDs.  Our version numbers, therefore, will be negative.
327  *
328  * For future expansion, the version member must always stay where it is.  At
329  * some point in the future, when more structs get versions, the version member
330  * should move into the cmd_header_t.
331  */
332 #define	CPU_MKVERSION(version)	((uint_t)(0 - (version)))
333 
334 #define	CMD_CPU_VERSION_1	CPU_MKVERSION(1)	/* -1 */
335 #define	CMD_CPU_VERSION_2	CPU_MKVERSION(2)	/* -2 */
336 #define	CMD_CPU_VERSION_3	CPU_MKVERSION(3)	/* -3 */
337 #define	CMD_CPU_VERSION		CMD_CPU_VERSION_3
338 
339 #define	CMD_CPU_VERSIONED(cpu)	((int)(cpu)->cpu_version < 0)
340 
341 #define	CMD_CPU_F_DELETING	0x1
342 
343 typedef struct cmd_cpu_0 {	/* pre-versioning layout: cpuid follows hdr */
344 	cmd_header_t cpu0_header;	/* Nodetype must be CMD_NT_CPU */
345 	uint32_t cpu0_cpuid;		/* Logical ID for this CPU */
346 	cmd_cpu_type_t cpu0_type;	/* CPU model */
347 	fmd_case_t *cpu0_cases[4];	/* v0 had embedded case_t w/4 cases */
348 	uint8_t cpu0_faulting;		/* Set if fault has been issued */
349 	cmd_fmri_t cpu0_asru;		/* ASRU for this CPU */
350 	cmd_fmri_t cpu0_fru;		/* FRU for this CPU */
351 	cmd_cpu_uec_t cpu0_uec;		/* UE cache */
352 	cmd_cpu_uec_t cpu0_olduec;	/* To-be-flushed UE cache */
353 	id_t cpu0_uec_flush;		/* Timer ID for UE cache flush */
354 	uint_t cpu0_uec_nflushes;	/* # of flushes since last restart/GC */
355 	cmd_list_t cpu0_xxu_retries;	/* List of pending xxU retries */
356 } cmd_cpu_0_t;
357 
358 typedef struct cmd_cpu_1 {	/* versioned layout w/ case-array pointer */
359 	cmd_header_t cpu1_header;	/* Nodetype must be CMD_NT_CPU */
360 	uint_t cpu1_version;		/* struct version - must follow hdr */
361 	uint32_t cpu1_cpuid;		/* Logical ID for this CPU */
362 	cmd_cpu_type_t cpu1_type;	/* CPU model */
363 	uintptr_t *cpu1_cases;		/* v1 had a pointer to a case array */
364 	uint8_t cpu1_faulting;		/* Set if fault has been issued */
365 	cmd_fmri_t cpu1_asru;		/* ASRU for this CPU */
366 	cmd_fmri_t cpu1_fru;		/* FRU for this CPU */
367 	cmd_cpu_uec_t cpu1_uec;		/* UE cache */
368 	cmd_cpu_uec_t cpu1_olduec;	/* To-be-flushed UE cache */
369 	id_t cpu1_uec_flush;		/* Timer ID for UE cache flush */
370 	uint_t cpu1_uec_nflushes;	/* # of flushes since last restart/GC */
371 	cmd_list_t cpu1_xxu_retries;	/* List of pending xxU retries */
372 } cmd_cpu_1_t;
373 
374 typedef struct cmd_cpu_2 {	/* as cmd_cpu_1, minus cases/flush/retries */
375 	cmd_header_t cpu2_header;	/* Nodetype must be CMD_NT_CPU */
376 	uint_t cpu2_version;		/* struct version - must follow hdr */
377 	uint32_t cpu2_cpuid;		/* Logical ID for this CPU */
378 	cmd_cpu_type_t cpu2_type;	/* CPU model */
379 	uint8_t cpu2_faulting;		/* Set if fault has been issued */
380 	cmd_fmri_t cpu2_asru;		/* ASRU for this CPU */
381 	cmd_fmri_t cpu2_fru;		/* FRU for this CPU */
382 	cmd_cpu_uec_t cpu2_uec;		/* UE cache */
383 	cmd_cpu_uec_t cpu2_olduec;	/* To-be-flushed UE cache */
384 } cmd_cpu_2_t;
385 
386 /* Portion of the cpu structure which must be persisted */
387 typedef struct cmd_cpu_pers {	/* cmd_cpu_2 fields plus cpup_level */
388 	cmd_header_t cpup_header;	/* Nodetype must be CMD_NT_CPU */
389 	uint_t cpup_version;		/* struct version - must follow hdr */
390 	uint32_t cpup_cpuid;		/* Logical ID for this CPU */
391 	cmd_cpu_type_t cpup_type;	/* CPU model */
392 	uint8_t cpup_faulting;		/* Set if fault has been issued */
393 	uint8_t cpup_level;		/* cpu group level - 0 == thread */
394 	cmd_fmri_t cpup_asru;		/* ASRU for this CPU */
395 	cmd_fmri_t cpup_fru;		/* FRU for this CPU */
396 	cmd_cpu_uec_t cpup_uec;		/* UE cache */
397 	cmd_cpu_uec_t cpup_olduec;	/* To-be-flushed UE cache */
398 } cmd_cpu_pers_t;
399 
400 /* Persistent and dynamic CPU data */
401 struct cmd_cpu {
402 	cmd_cpu_pers_t cpu_pers;	/* Portion that must be persisted */
403 	cmd_cpu_cases_t cpu_cases;	/* Cases that can be open on this CPU */
404 	id_t cpu_uec_flush;		/* Timer ID for UE cache flush */
405 	uint_t cpu_uec_nflushes;	/* # of flushes since last restart/GC */
406 	cmd_list_t cpu_xxu_retries;	/* List of pending xxU retries */
407 	uint_t cpu_flags;		/* presumably CMD_CPU_F_* -- confirm */
408 	cmd_list_t cpu_Lxcaches;	/* List of Lxcache state structures */
409 	fmd_stat_t Lxcache_creat;	/* num of Lxcache states created */
410 };
411 
412 #define	CMD_CPU_MAXSIZE \
413 	MAX(MAX(sizeof (cmd_cpu_0_t), sizeof (cmd_cpu_1_t)), \
414 	    MAX(sizeof (cmd_cpu_2_t), sizeof (cmd_cpu_pers_t)))
415 #define	CMD_CPU_MINSIZE \
416 	MIN(MIN(sizeof (cmd_cpu_0_t), sizeof (cmd_cpu_1_t)), \
417 	    MIN(sizeof (cmd_cpu_2_t), sizeof (cmd_cpu_pers_t)))
418 
419 #define	cpu_header		cpu_pers.cpup_header
420 #define	cpu_nodetype		cpu_pers.cpup_header.hdr_nodetype
421 #define	cpu_bufname		cpu_pers.cpup_header.hdr_bufname
422 #define	cpu_version		cpu_pers.cpup_version
423 #define	cpu_cpuid		cpu_pers.cpup_cpuid
424 #define	cpu_type		cpu_pers.cpup_type
425 #define	cpu_faulting		cpu_pers.cpup_faulting
426 #define	cpu_level		cpu_pers.cpup_level
427 #define	cpu_asru		cpu_pers.cpup_asru
428 #define	cpu_fru			cpu_pers.cpup_fru
429 #define	cpu_uec			cpu_pers.cpup_uec
430 #define	cpu_olduec		cpu_pers.cpup_olduec
431 #define	cpu_icache		cpu_cases.cpuc_icache
432 #define	cpu_dcache		cpu_cases.cpuc_dcache
433 #define	cpu_pcache		cpu_cases.cpuc_pcache
434 #define	cpu_itlb		cpu_cases.cpuc_itlb
435 #define	cpu_dtlb		cpu_cases.cpuc_dtlb
436 #define	cpu_l2data		cpu_cases.cpuc_l2data
437 #define	cpu_l2tag		cpu_cases.cpuc_l2tag
438 #define	cpu_l3data		cpu_cases.cpuc_l3data
439 #define	cpu_l3tag		cpu_cases.cpuc_l3tag
440 #define	cpu_fpu			cpu_cases.cpuc_fpu
441 #define	cpu_ireg 		cpu_cases.cpuc_ireg
442 #define	cpu_freg		cpu_cases.cpuc_freg
443 #define	cpu_mau			cpu_cases.cpuc_mau
444 #define	cpu_l2ctl		cpu_cases.cpuc_l2ctl
445 #define	cpu_misc_regs		cpu_cases.cpuc_misc_regs
446 #define	cpu_lfu			cpu_cases.cpuc_lfu
447 #ifdef sun4u
448 #define	cpu_opl_invsfsr		cpu_cases.cpuc_opl_invsfsr
449 #define	cpu_oplue_detcpu	cpu_cases.cpuc_oplue_detcpu
450 #define	cpu_oplue_detio		cpu_cases.cpuc_oplue_detio
451 #define	cpu_opl_mtlb		cpu_cases.cpuc_opl_mtlb
452 #define	cpu_opl_tlbp		cpu_cases.cpuc_opl_tlbp
453 #define	cpu_opl_inv_urg		cpu_cases.cpuc_opl_inv_urg
454 #define	cpu_opl_cre		cpu_cases.cpuc_opl_cre
455 #define	cpu_opl_tsb_ctx		cpu_cases.cpuc_opl_tsb_ctx
456 #define	cpu_opl_tsbp		cpu_cases.cpuc_opl_tsbp
457 #define	cpu_opl_pstate		cpu_cases.cpuc_opl_pstate
458 #define	cpu_opl_tstate		cpu_cases.cpuc_opl_tstate
459 #define	cpu_opl_iug_f		cpu_cases.cpuc_opl_iug_f
460 #define	cpu_opl_iug_r		cpu_cases.cpuc_opl_iug_r
461 #define	cpu_opl_sdc		cpu_cases.cpuc_opl_sdc
462 #define	cpu_opl_wdt		cpu_cases.cpuc_opl_wdt
463 #define	cpu_opl_dtlb		cpu_cases.cpuc_opl_dtlb
464 #define	cpu_opl_itlb		cpu_cases.cpuc_opl_itlb
465 #define	cpu_opl_core_err	cpu_cases.cpuc_opl_core_err
466 #define	cpu_opl_dae		cpu_cases.cpuc_opl_dae
467 #define	cpu_opl_iae		cpu_cases.cpuc_opl_iae
468 #define	cpu_opl_uge		cpu_cases.cpuc_opl_uge
469 #endif	/* sun4u */
470 
471 #define	cpu_asru_nvl		cpu_asru.fmri_nvl
472 #define	cpu_fru_nvl		cpu_fru.fmri_nvl
473 
474 /*
475  * L2$ and L3$ Data errors
476  *
477  *          SERD name
478  *   Type   (if any)   Fault
479  *  ------ ----------- -------------------------------
480  *   xxC   l2cachedata fault.cpu.<cputype>.l2cachedata
481  *   xxU        -      fault.cpu.<cputype>.l2cachedata
482  *  L3_xxC l3cachedata fault.cpu.<cputype>.l3cachedata
483  *  L3_xxU      -      fault.cpu.<cputype>.l3cachedata
484  *
485  * NOTE: For the purposes of the discussion below, xxC and xxU refer to both
486  *       L2$ and L3$ data errors.
487  *
488  * These ereports will be dropped if (among other things) they are side-effects
489  * of UEs (xxUs only) or other xxCs or xxUs.  Whenever UEs are detected, they
490  * are added to a per-CPU cache.  xxUs are then compared to this cache.  If a
491  * xxU's AFAR refers to an address which recently saw a UE, the xxU is dropped,
492  * as it was most likely caused by the UE.  When multiple xxCs and xxUs are seen
493  * with the same ENA, all save one are generally side-effects.  We track these
494  * groups (referred to as trains), matching them against a premade list.  If one
495  * of the trains matches, we drop all but the primary, which is indicated in the
496  * list.
497  *
498  * The expected resolution of l2cachedata and l3cachedata faults is the
499  * disabling of the indicated CPU.
500  */
501 extern cmd_evdisp_t cmd_xxc(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
502     const char *, cmd_errcl_t);
503 extern cmd_evdisp_t cmd_xxu(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
504     const char *, cmd_errcl_t);
505 
506 /*
507  * As of Niagara-2, we ignore writeback (ldwc, ldwu) errors.  Since these were
508  * the only defined follow-on errors for sun4v trains, sun4v L2 cache data
509  * errors no longer need to use the train mechanism.
510  */
511 
512 extern cmd_evdisp_t cmd_l2c(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
513     const char *, cmd_errcl_t);
514 extern cmd_evdisp_t cmd_l2u(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
515     const char *, cmd_errcl_t);
516 
517 /*
518  * Common Errdata structure for SERD engines
519  */
520 typedef struct errdata {
521 	cmd_serd_t *ed_serd;		/* SERD engine data (cmd_serd_t) */
522 	const char *ed_fltnm;		/* presumably fault name -- confirm */
523 	const cmd_ptrsubtype_t ed_pst;	/* pointer subtype (cmd_ptrsubtype_t) */
524 } errdata_t;
525 
526 /*
527  * L2$ and L3$ Tag errors
528  *
529  *           SERD name
530  *   Type    (if any)   Fault
531  *  ------- ----------- -------------------------------
532  *   TxCE   l2cachetag  fault.cpu.<cputype>.l2cachetag
533  *  L3_THCE l3cachetag  fault.cpu.<cputype>.l3cachetag
534  *    LTC   l2cachetag	fault.cpu.<cputype>.l2cachetag
535  *
536  * We'll never see the uncorrectable Tag errors - they'll cause the machine to
537  * reset, and we'll be ne'er the wiser.
538  *
539  * The expected resolution of l2cachetag and l3cachetag faults is the disabling
540  * of the indicated CPU.
541  */
542 extern cmd_evdisp_t cmd_txce(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
543     const char *, cmd_errcl_t);
544 
545 extern cmd_evdisp_t cmd_l3_thce(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
546     const char *, cmd_errcl_t);
547 
548 /*
549  * L1$ errors
550  *
551  *          SERD name
552  *   Type   (if any)   Fault
553  *  ------- --------- -------------------------------
554  *   IPE     icache   fault.cpu.<cputype>.icache
555  *   IxSPE   icache   fault.cpu.<cputype>.icache
556  *   DPE     dcache   fault.cpu.<cputype>.dcache
557  *   DxSPE   dcache   fault.cpu.<cputype>.dcache
558  *   PDSPE   pcache   fault.cpu.<cputype>.pcache
559  *
560  * The I$, D$, and P$ are clean, and thus have no uncorrectable errors.
561  *
562  * The expected resolution of icache, dcache, and pcache faults is the disabling
563  * of the indicated CPU.
564  */
565 extern cmd_evdisp_t cmd_icache(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
566     const char *, cmd_errcl_t);
567 extern cmd_evdisp_t cmd_dcache(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
568     const char *, cmd_errcl_t);
569 extern cmd_evdisp_t cmd_pcache(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
570     const char *, cmd_errcl_t);
571 
572 /*
573  * TLB errors
574  *
575  *         SERD name
576  *   Type  (if any)   Fault
577  *  ------ --------- -------------------------------
578  *  ITLBPE   itlb    fault.cpu.<cputype>.itlb
579  *  DTLBPE   dtlb    fault.cpu.<cputype>.dtlb
580  *
581  * The expected resolution of itlb and dtlb faults is the disabling of the
582  * indicated CPU.
583  */
584 extern cmd_evdisp_t cmd_itlb(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
585     const char *, cmd_errcl_t);
586 extern cmd_evdisp_t cmd_dtlb(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
587     const char *, cmd_errcl_t);
588 
589 extern void cmd_cpuerr_close(fmd_hdl_t *, void *);
590 
591 /*
592  * FPU errors
593  *
594  *         SERD name
595  *   Type  (if any)   Fault
596  *  ------ --------- -------------------------------
597  *   FPU       -     fault.cpu.<cputype>.fpu
598  *
599  * The expected resolution of FPU faults is the disabling of the indicated CPU.
600  */
601 extern cmd_evdisp_t cmd_fpu(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
602     const char *, cmd_errcl_t);
603 
604 
605 /*
606  * ireg errors
607  *
608  *         SERD name
609  *   Type  (if any)   Fault
610  *  ------ --------- -------------------------------
611  *   IRC     ireg    fault.cpu.<cputype>.ireg
612  *   IRU      -				 "
613  *
614  * The expected resolution of ireg faults is the disabling of the indicated CPU.
615  */
616 extern cmd_evdisp_t cmd_irc(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
617     const char *, cmd_errcl_t);
618 extern cmd_evdisp_t cmd_iru(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
619     const char *, cmd_errcl_t);
620 
621 /*
622  * freg errors
623  *
624  *         SERD name
625  *   Type  (if any)   Fault
626  *  ------ --------- -------------------------------
627  *   FRC     freg    fault.cpu.ultraSPARC-T1.frc
628  *   FRU      -                           " .fru
629  *
630  * The expected resolution of freg faults is the repair of the indicated CPU.
631  */
632 extern cmd_evdisp_t cmd_frc(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
633     const char *, cmd_errcl_t);
634 extern cmd_evdisp_t cmd_fru(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
635     const char *, cmd_errcl_t);
636 
637 /*
638  * MAU errors
639  *
640  *         SERD name
641  *   Type  (if any)   Fault
642  *  ------ --------- -------------------------------
643  *   MAU     mau    fault.cpu.<cputype>.mau
644  *
645  * The expected resolution of mau faults is the repair of the indicated CPU.
646  */
647 extern cmd_evdisp_t cmd_mau(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
648     const char *, cmd_errcl_t);
649 
650 /*
651  * L2CTL errors
652  *
653  *         SERD name
654  *   Type  (if any)   Fault
655  *  ------ --------- -------------------------------
656  *  L2CTL     -     fault.cpu.<cputype>.l2ctl
657  *
658  * The expected resolution of l2ctl faults is the repair of the indicated CPU.
659  */
660 extern cmd_evdisp_t cmd_l2ctl(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
661     const char *, cmd_errcl_t);
662 
663 /*
664  * SBD (Store Buffer Data) errors
665  * SCA (Scratchpad Array) errors
666  * TC (Tick compare) errors
667  * TSA (Trap stack Array) errors
668  *
669  *         SERD name
670  *   Type  (if any)   Fault
671  *  ------ --------- -------------------------------
672  *   SBDC     misc_regs    fault.cpu.<cputype>.misc_regs
673  *   SBDU
674  *   SCAC, SCAU
675  *   TCC, TCU
676  *   TSAC, TSAU
677  *
678  * The expected resolution of misc_regs faults is the repair of
679  * the indicated CPU.
680  */
681 extern cmd_evdisp_t cmd_miscregs_ce(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
682     const char *, cmd_errcl_t);
683 extern cmd_evdisp_t cmd_miscregs_ue(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
684     const char *, cmd_errcl_t);
685 
686 extern cmd_evdisp_t cmd_miscregs_train(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
687     const char *, cmd_errcl_t);
688 
689 /*
690  * Type                                          Fault
691  * ---------------------------------------------------------------------
692  * LFU-RTF   uncorrectable link retrain fail error    fault.cpu.T2plus.lfu-u
693  * LFU-TTO   uncorrectable training timeout error
694  * LFU-CTO   uncorrectable config timeout error
695  * LFU-MLF   uncorrectable multi lanes link fail error
696  * LFU-SLF   correctable single lane failover	      fault.cpu.T2plus.lfu-f
697  *
698  * The expected resolution of lfu faults is the repair of the indicated CPU.
699  */
700 extern cmd_evdisp_t cmd_lfu_ue(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
701     const char *, cmd_errcl_t);
702 extern cmd_evdisp_t cmd_lfu_ce(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
703     const char *, cmd_errcl_t);
704 /*
705  * Type                                          Fault
706  * ---------------------------------------------------------------------
707  * Coherency link protocol errors
708  * to        Transaction timed out  		fault.cpu.T2plus.lfu-p
709  * frack     Invalid or redundant request ack
710  * fsr       Invalid or redundant snoop response
711  * fdr       Invalid or redundant data return
712  * snptyp    Invalid snoop type received from
713  *           coherency link
714  *
715  * The expected resolution of lfu faults is the repair of the indicated CPU.
716  */
717 extern cmd_evdisp_t cmd_lfu_pe(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
718     const char *, cmd_errcl_t);
719 
720 /*
721  * CPUs are described by FMRIs.  This routine will retrieve the CPU state
722  * structure (creating a new one if necessary) described by the detector
723  * FMRI in the passed ereport.
724  */
725 extern cmd_cpu_t *cmd_cpu_lookup_from_detector(fmd_hdl_t *, nvlist_t *,
726     const char *, uint8_t);
727 
728 extern char *cmd_cpu_getfrustr(fmd_hdl_t *, cmd_cpu_t *);
729 extern char *cmd_cpu_getpartstr(fmd_hdl_t *, cmd_cpu_t *);
730 
731 extern char *cmd_cpu_getserialstr(fmd_hdl_t *, cmd_cpu_t *);
732 extern nvlist_t *cmd_cpu_mkfru(fmd_hdl_t *, char *, char *, char *);
733 
734 extern cmd_cpu_t *cmd_cpu_lookup(fmd_hdl_t *, nvlist_t *, const char *,
735     uint8_t);
736 
737 extern void cmd_cpu_create_faultlist(fmd_hdl_t *, fmd_case_t *, cmd_cpu_t *,
738     const char *, nvlist_t *, uint_t);
739 
740 extern cmd_cpu_t *cmd_restore_cpu_only(fmd_hdl_t *, fmd_case_t *, char *);
741 extern void cmd_cpu_destroy(fmd_hdl_t *, cmd_cpu_t *);
742 extern void *cmd_cpu_restore(fmd_hdl_t *, fmd_case_t *, cmd_case_ptr_t *);
743 extern void cmd_cpu_validate(fmd_hdl_t *);
744 extern void cmd_cpu_timeout(fmd_hdl_t *, id_t, void *);
745 extern void cmd_cpu_gc(fmd_hdl_t *);
746 extern void cmd_cpu_fini(fmd_hdl_t *hdl);
747 extern char *cmd_cpu_serdnm_create(fmd_hdl_t *, cmd_cpu_t *, const char *);
748 extern nvlist_t *cmd_cpu_fmri_create(uint32_t, uint8_t);
749 
750 extern uint32_t cmd_cpu2core(uint32_t, cmd_cpu_type_t, uint8_t);
751 
752 #define	CMD_CPU_LEVEL_THREAD		0
753 #define	CMD_CPU_LEVEL_CORE		1
754 #define	CMD_CPU_LEVEL_CHIP		2
755 #define	CMD_CPU_STAT_BUMP(cpu, name)	((cpu)->name.fmds_value.ui64++)	/* increment cpu's `name' stat; cpu may be any pointer expression */
756 
757 typedef enum {	/* CPU families; return type of cmd_cpu_check_support() */
758     CMD_CPU_FAM_UNSUPPORTED,	/* first enumerator, so its value is 0 */
759     CMD_CPU_FAM_CHEETAH,
760     CMD_CPU_FAM_NIAGARA,
761     CMD_CPU_FAM_SPARC64
762 } cpu_family_t;
763 
764 typedef struct faminfo {
765 	cpu_family_t fam_value;		/* CPU family (cpu_family_t) */
766 	boolean_t ecache_flush_needed;	/* presumably: E$ flush needed -- confirm */
767 } faminfo_t;
768 
769 extern cpu_family_t cmd_cpu_check_support(void);
770 extern boolean_t cmd_cpu_ecache_support(void);
771 
772 extern int cmd_xr_fill(fmd_hdl_t *, nvlist_t *, cmd_xr_t *, cmd_errcl_t);
773 extern void cmd_fill_errdata(cmd_errcl_t, cmd_cpu_t *, cmd_case_t **,
774     const errdata_t **);
775 extern cmd_xxcu_trw_t *cmd_trw_lookup(uint64_t, uint8_t, uint64_t);
776 extern cmd_evdisp_t cmd_nop_train(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
777     const char *, cmd_errcl_t);
778 extern cmd_errcl_t cmd_train_match(cmd_errcl_t, cmd_errcl_t);
779 extern int cmd_afar_status_check(uint8_t, cmd_errcl_t);
780 
781 #ifdef sun4u
782 extern int cmd_cpu_synd_check(uint16_t, cmd_errcl_t clcode);
783 #else /* sun4u */
784 extern int cmd_cpu_synd_check(uint32_t, cmd_errcl_t clcode);
785 #endif /* sun4u */
786 
787 extern int cmd_afar_valid(fmd_hdl_t *hdl, nvlist_t *nvl, cmd_errcl_t,
788     uint64_t *afar);
789 
790 #ifdef __cplusplus
791 }
792 #endif
793 
794 #endif /* _CMD_CPU_H */
795