xref: /titanic_50/usr/src/cmd/fm/modules/sun4/cpumem-diagnosis/cmd_mem.h (revision d4ac42a1cd3016618a9ba0330862d410f0058f89)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
23  */
24 
25 #ifndef _CMD_MEM_H
26 #define	_CMD_MEM_H
27 
28 /*
29  * Support routines for managing state related to memory modules.
30  *
31  * Correctable errors generally cause changes to the DIMM-related state (see
32  * cmd_dimm.c), whereas uncorrectable errors tend to use the bank-related
33  * routines (see cmd_bank.c).  The primary exception to this division (though
34  * it eventually devolves to one of the two) is the RxE/FRx pair emitted by
35  * UltraSPARC-IIIi processors.  With these errors, a complete pair must be
36  * received and matched before we know whether we're dealing with a CE or a UE.
37  */
38 
39 #include <cmd.h>
40 #include <cmd_state.h>
41 #include <cmd_fmri.h>
42 #include <sys/errclassify.h>
43 #include <cmd_cpu.h>
44 
45 #ifdef __cplusplus
46 extern "C" {
47 #endif
48 
/* NOTE(review): presumably a flag bit; its users are outside this header. */
49 #define	CMD_MEM_F_FAULTING	0x1
50 
51 /*
52  * Used to store as-yet unmatched IOxEs, RxEs, and FRxs.  When a new IOxE,
53  * RxE or FRx arrives, we traverse the cmd.cmd_iorxefrx list, looking for
54  * matching entries.  Matching has a cpuid-based component, as well as a
55  * temporal one.  We can compare the cpuids directly, using the cmd_iorxefrx_t
56  * and the newly-received event. Temporal comparison isn't performed directly.
57  * Instead, we ensure that entries in the iorxefrx list are removed when they
58  * expire by means of timers. This frees the matching code from the need to
59  * worry about time.
60  */
61 typedef struct cmd_iorxefrx {
62 	cmd_list_t rf_list;		/* List of cmd_iorxefrx_t's */
63 	cmd_errcl_t rf_errcl;		/* Error type (CMD_ERRCL_*) */
64 	uint_t rf_afsr_agentid;		/* Remote Agent ID (from AFSR) */
65 	uint_t rf_det_agentid;		/* Local Agent ID (from detector) */
66 	id_t rf_expid;			/* Timer ID for entry expiration */
67 	uint64_t rf_afar;		/* Valid for RxE only */
68 	uint8_t rf_afar_status;		/* Valid for RxE only */
69 	ce_dispact_t rf_type;		/* Valid for RxE only */
70 	uint16_t rf_synd;		/* Valid for FRx only */
71 	uint8_t rf_synd_status;		/* Valid for FRx only */
72 	uint64_t rf_afsr;		/* Valid for FRx only */
73 	uint64_t rf_disp;		/* Valid for RCE only */
74 } cmd_iorxefrx_t;
75 
76 typedef struct cmd_dimm cmd_dimm_t;	/* opaque in this header */
77 typedef struct cmd_bank cmd_bank_t;	/* opaque in this header */
78 #ifdef sun4v
79 typedef struct cmd_branch cmd_branch_t;	/* opaque in this header */
80 #endif	/* sun4v */
81 
82 /*
83  * Correctable and Uncorrectable memory errors
84  *
85  * CEs of "Unknown" or "Intermittent" classification are not used in diagnosis.
86  *
87  * "Persistent" CEs are added to per-DIMM SERD engines.  When the
88  * engine for a given DIMM fires, the page corresponding to the CE that
89  * caused the engine to fire is retired, and the SERD engine for that
90  * DIMM is reset.
91  *
92  * "Possibly Persistent" CEs are at least Persistent and so are treated
93  * as "Persistent" errors above, being added to the same SERD engines.
94  *
95  * "Leaky" CEs and "Sticky" CEs trigger immediate page retirement.
96  *
97  * "Possibly Sticky" CEs to which no valid partner test has been applied
98  * are not used in diagnosis.  Where a valid partner test has been applied
99  * but did not confirm "Sticky" status there is a _suggestion_ that the
100  * original cpu may be a bad reader or writer or suffering from other
101  * datapath issues.  To avoid retiring pages for such non-DIMM problems
102  * these classifications are also not used in diagnosis.
103  *
104  * UEs immediately trigger page retirements, but do not affect the CE SERD
105  * engines.  In addition, UEs are recorded in the UE caches of the detecting
106  * CPUs.  When a page is to be retired, a fault.memory.page fault is
107  * generated.
108  *
109  */
110 
/*
 * Function type whose parameter list is identical to those of
 * cmd_ce_common() and cmd_ue_common(), declared below.
 *
 * NOTE(review): the uint64_t, uint8_t, uint16_t, uint8_t, ce_dispact_t,
 * uint64_t run lines up with the rf_afar, rf_afar_status, rf_synd,
 * rf_synd_status, rf_type and rf_disp fields of cmd_iorxefrx_t, so the
 * arguments are presumably afar, afar status, syndrome, syndrome status,
 * type and disp -- confirm against the definitions.
 */
111 typedef cmd_evdisp_t cmd_xe_handler_f(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
112     const char *, uint64_t, uint8_t, uint16_t, uint8_t, ce_dispact_t, uint64_t,
113     nvlist_t *);
114 
/*
 * Signature notes: cmd_ce() and cmd_ue() share one five-argument parameter
 * list; cmd_ce_common() and cmd_ue_common() share the cmd_xe_handler_f
 * parameter list.
 *
 * NOTE(review): the five-argument form uses the same types that
 * cmd_rxefrx_common() (later in this header) names hdl, ep, nvl, class and
 * clcode; presumably the meanings match -- confirm against the definitions.
 */
115 extern ce_dispact_t cmd_mem_name2type(const char *, int);
116 extern int cmd_synd2upos(uint16_t);
117 extern cmd_evdisp_t cmd_ce(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
118     const char *, cmd_errcl_t);
119 extern cmd_evdisp_t cmd_ue(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
120     const char *, cmd_errcl_t);
121 extern cmd_evdisp_t cmd_ce_common(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
122     const char *, uint64_t, uint8_t, uint16_t, uint8_t,
123     ce_dispact_t, uint64_t, nvlist_t *);
124 extern cmd_evdisp_t cmd_ue_common(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
125     const char *, uint64_t, uint8_t, uint16_t, uint8_t,
126     ce_dispact_t, uint64_t, nvlist_t *);
127 extern cmd_evdisp_t cmd_mem_synd_check(fmd_hdl_t *, uint64_t, uint8_t,
128     uint16_t, uint8_t, cmd_cpu_t *);
129 extern void cmd_dimm_close(fmd_hdl_t *, void *);
130 extern void cmd_bank_close(fmd_hdl_t *, void *);
131 extern int cmd_same_datapath_dimms(cmd_dimm_t *, cmd_dimm_t *);
132 extern void cmd_gen_datapath_fault(fmd_hdl_t *, cmd_dimm_t *, cmd_dimm_t *,
133     uint16_t, nvlist_t *);
134 extern void cmd_to_hashed_addr(uint64_t *, uint64_t, const char *);
135 
136 #ifdef sun4u
137 extern char *cmd_cpu_getfrustr_by_id(fmd_hdl_t *, uint32_t);
138 #endif	/* sun4u */
139 
140 #ifdef sun4v
141 extern void cmd_branch_close(fmd_hdl_t *, void *);
142 extern cmd_evdisp_t cmd_fb(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
143     const char *, cmd_errcl_t);
144 extern cmd_evdisp_t cmd_fw_defect(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
145     const char *, cmd_errcl_t);
146 extern cmd_evdisp_t cmd_fb_train(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
147     const char *, cmd_errcl_t);
148 extern cmd_evdisp_t cmd_ue_train(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
149     const char *, cmd_errcl_t);
150 #endif	/* sun4v */
151 
152 /*
153  * US-IIIi I/O, Remote and Foreign Read memory errors
154  *
155  * When one processor or I/O bridge attempts to read memory local to
156  * another processor, one each of IOCE/IOUE/RCE/RUE and FRC/FRU will be
157  * generated, depending on the type of error.  Both the IOxE/RxE and the FRx
158  * are needed, as each contains data necessary to the diagnosis of the error.
159  * Upon receipt of one of the errors, we wait until we receive the other.
160  * When the pair has been successfully received and matched, a CE or UE,
161  * as appropriate, is synthesized from the data in the matched ereports.
162  * The synthesized ereports are handled by the normal CE and UE mechanisms.
163  */
164 extern cmd_evdisp_t cmd_frx(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
165     const char *, cmd_errcl_t);
166 extern cmd_evdisp_t cmd_rxe(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
167     const char *, cmd_errcl_t);
168 extern cmd_evdisp_t cmd_ioxe(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
169     const char *, cmd_errcl_t);
170 extern cmd_evdisp_t cmd_ioxe_sec(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
171     const char *, cmd_errcl_t);
/*
 * Takes the same five leading parameter types as the handlers above (named
 * here), plus matchmask.
 * NOTE(review): matchmask is presumably the set of error classes accepted as
 * the partner ereport during matching -- confirm at the definition.
 */
172 extern cmd_evdisp_t cmd_rxefrx_common(fmd_hdl_t *hdl, fmd_event_t *ep,
173     nvlist_t *nvl, const char *class, cmd_errcl_t clcode,
174     cmd_errcl_t matchmask);
175 
176 /*
177  * A list of received IOxE/RxE/FRx ereports is maintained for correlation
178  * purposes (see above).  These two routines queue a new cmd_iorxefrx_t on
179  * that list and free an existing one.  Pruning of the list is handled
180  * automatically (entries expire via the rf_expid timer).
181  */
182 extern void cmd_iorxefrx_queue(fmd_hdl_t *, cmd_iorxefrx_t *);
183 extern void cmd_iorxefrx_free(fmd_hdl_t *, cmd_iorxefrx_t *);
184 
/*
 * NOTE(review): cmd_fmri_get_unum() returns const char *, so the caller
 * presumably must not free or modify it.  Ownership of the nvlist_t *
 * returned by cmd_mem_fmri_create() and cmd_mem_fmri_derive() is not stated
 * in this header -- confirm at the definitions.
 */
185 extern const char *cmd_fmri_get_unum(nvlist_t *);
186 extern nvlist_t *cmd_mem_fmri_create(const char *, char **, size_t);
187 extern nvlist_t *cmd_mem_fmri_derive(fmd_hdl_t *, uint64_t, uint64_t, uint16_t);
188 
/*
 * NOTE(review): the *_serdnm_create() routines return char * and presumably
 * build names for the SERD engines discussed earlier in this header; who
 * frees the returned string is not visible here -- confirm at the
 * definitions.
 */
189 extern void cmd_mem_case_restore(fmd_hdl_t *, cmd_case_t *, fmd_case_t *,
190     const char *, const char *);
191 extern char *cmd_mem_serdnm_create(fmd_hdl_t *, const char *, const char *);
192 extern char *cmd_page_serdnm_create(fmd_hdl_t *, const char *, uint64_t);
193 extern char *cmd_mq_serdnm_create(fmd_hdl_t *, const char *, uint64_t,
194     uint16_t, uint16_t);
195 extern void cmd_mem_retirestat_create(fmd_hdl_t *, fmd_stat_t *, const char *,
196     uint64_t, const char *);
197 extern int cmd_mem_thresh_check(fmd_hdl_t *, uint_t);
198 extern ulong_t cmd_mem_get_phys_pages(fmd_hdl_t *);
199 
/*
 * NOTE(review): id_t is also the type of cmd_iorxefrx_t's rf_expid (timer ID
 * for entry expiration), so cmd_mem_timeout() is presumably called with that
 * timer ID when a queued entry expires -- confirm at the definition.
 */
200 extern void cmd_mem_timeout(fmd_hdl_t *, id_t);
201 extern void cmd_mem_gc(fmd_hdl_t *);
202 extern void cmd_mem_fini(fmd_hdl_t *);
203 
204 #ifdef __cplusplus
205 }
206 #endif
207 
208 #endif /* _CMD_MEM_H */
209