xref: /titanic_44/usr/src/cmd/fm/modules/sun4u/cpumem-diagnosis/cmd_memerr_arch.c (revision d4ac42a1cd3016618a9ba0330862d410f0058f89)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24  */
25 
26 /*
27  * Ereport-handling routines for memory errors
28  */
29 
30 #include <cmd_mem.h>
31 #include <cmd_dimm.h>
32 #include <cmd_bank.h>
33 #include <cmd_page.h>
34 #include <cmd_cpu.h>
35 #include <cmd.h>
36 
37 #include <strings.h>
38 #include <string.h>
39 #include <errno.h>
40 #include <limits.h>
41 #include <unistd.h>
42 #include <fm/fmd_api.h>
43 #include <sys/fm/protocol.h>
44 #include <sys/fm/cpu/UltraSPARC-III.h>
45 #include <sys/async.h>
46 #include <sys/cheetahregs.h>
47 #include <sys/errclassify.h>
48 #include <sys/fm/io/sun4upci.h>
49 #include <sys/pci/pcisch.h>
50 
/*
 * Jalapeno-specific AFSR field definitions (values from cheetahregs.h).
 * Each mask is paired with the shift that right-justifies the field.
 */
#define	USIIIi_AFSR_AID		0x0000000000003e00ull /* AID causing UE/CE */
#define	USIIIi_AFSR_AID_SHIFT	9
#define	USIIIi_AFSR_JREQ	0x0000000007000000ull /* Active JBus req */
#define	USIIIi_AFSR_JREQ_SHIFT	24

/*
 * Mask applied to agent ids when an IOxE is involved in a match; it drops
 * the least significant bit of the id.
 */
#define	TOM_AID_MATCH_MASK	0xe

/*
 * Fire ereports: the fixed agent id recorded for them, the mask that
 * reduces the JITEL1 payload value to an AFAR, and the name of that
 * payload member.
 */
#define	FIRE_AID		0xe
#define	FIRE_JBC_ADDR_MASK	0x000007ffffffffffull
#define	FIRE_JBC_JITEL1		"jbc-jitel1"
61 
62 /*ARGSUSED*/
63 cmd_evdisp_t
cmd_mem_synd_check(fmd_hdl_t * hdl,uint64_t afar,uint8_t afar_status,uint16_t synd,uint8_t synd_status,cmd_cpu_t * cpu)64 cmd_mem_synd_check(fmd_hdl_t *hdl, uint64_t afar, uint8_t afar_status,
65     uint16_t synd, uint8_t synd_status, cmd_cpu_t *cpu)
66 {
67 	if (synd == CH_POISON_SYND_FROM_XXU_WRITE ||
68 	    ((cpu->cpu_type == CPU_ULTRASPARC_IIIi ||
69 	    cpu->cpu_type == CPU_ULTRASPARC_IIIiplus) &&
70 	    synd == CH_POISON_SYND_FROM_XXU_WRMERGE)) {
71 		fmd_hdl_debug(hdl,
72 		    "discarding UE due to magic syndrome %x\n", synd);
73 		return (CMD_EVD_UNUSED);
74 	}
75 	return (CMD_EVD_OK);
76 }
77 
78 static cmd_evdisp_t
xe_common(fmd_hdl_t * hdl,fmd_event_t * ep,nvlist_t * nvl,const char * class,cmd_xe_handler_f * hdlr)79 xe_common(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl,
80     const char *class, cmd_xe_handler_f *hdlr)
81 {
82 	uint64_t afar;
83 	uint16_t synd;
84 	uint8_t afar_status, synd_status;
85 	nvlist_t *rsrc;
86 	char *typenm;
87 	uint64_t disp;
88 	int minorvers = 1;
89 
90 	if (nvlist_lookup_pairs(nvl, 0,
91 	    FM_EREPORT_PAYLOAD_NAME_AFAR, DATA_TYPE_UINT64, &afar,
92 	    FM_EREPORT_PAYLOAD_NAME_AFAR_STATUS, DATA_TYPE_UINT8, &afar_status,
93 	    FM_EREPORT_PAYLOAD_NAME_SYND, DATA_TYPE_UINT16, &synd,
94 	    FM_EREPORT_PAYLOAD_NAME_SYND_STATUS, DATA_TYPE_UINT8, &synd_status,
95 	    FM_EREPORT_PAYLOAD_NAME_ERR_TYPE, DATA_TYPE_STRING, &typenm,
96 	    FM_EREPORT_PAYLOAD_NAME_RESOURCE, DATA_TYPE_NVLIST, &rsrc,
97 	    NULL) != 0)
98 		return (CMD_EVD_BAD);
99 
100 	if (nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_NAME_ERR_DISP,
101 	    &disp) != 0)
102 		minorvers = 0;
103 
104 	return (hdlr(hdl, ep, nvl, class, afar, afar_status, synd,
105 	    synd_status, cmd_mem_name2type(typenm, minorvers), disp, rsrc));
106 }
107 
108 /*ARGSUSED*/
109 cmd_evdisp_t
cmd_ce(fmd_hdl_t * hdl,fmd_event_t * ep,nvlist_t * nvl,const char * class,cmd_errcl_t clcode)110 cmd_ce(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class,
111     cmd_errcl_t clcode)
112 {
113 	return (xe_common(hdl, ep, nvl, class, cmd_ce_common));
114 }
115 
116 /*ARGSUSED*/
117 cmd_evdisp_t
cmd_ue(fmd_hdl_t * hdl,fmd_event_t * ep,nvlist_t * nvl,const char * class,cmd_errcl_t clcode)118 cmd_ue(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class,
119     cmd_errcl_t clcode)
120 {
121 	return (xe_common(hdl, ep, nvl, class, cmd_ue_common));
122 }
123 
124 cmd_evdisp_t
cmd_frx(fmd_hdl_t * hdl,fmd_event_t * ep,nvlist_t * nvl,const char * class,cmd_errcl_t clcode)125 cmd_frx(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class,
126     cmd_errcl_t clcode)
127 {
128 	cmd_errcl_t matchmask = (clcode == CMD_ERRCL_FRC ? (CMD_ERRCL_RCE |
129 	    CMD_ERRCL_IOCE) : (CMD_ERRCL_RUE | CMD_ERRCL_IOUE));
130 
131 	return (cmd_rxefrx_common(hdl, ep, nvl, class, clcode, matchmask));
132 }
133 
134 /*
135  * When we complete an IOxE/RxE FRx pair, we have enough information to
136  * create either a CE or a UE, as appropriate.  Before dispatching the
137  * joined event to the xE handler, we need to generate the FMRI for the
138  * named DIMM.  While one of the events may already contain a resource FMRI,
139  * said FMRI is incomplete.  The detector didn't have the necessary
140  * information (the AFAR, the AFSR, *and* the syndrome) needed to create
141  * a DIMM-level FMRI.
142  */
143 static cmd_evdisp_t
iorxefrx_synthesize(fmd_hdl_t * hdl,fmd_event_t * ep,nvlist_t * nvl,const char * class,uint64_t afar,uint8_t afar_status,uint64_t afsr,uint16_t synd,uint8_t synd_status,ce_dispact_t type,uint64_t disp,cmd_xe_handler_f * hdlr)144 iorxefrx_synthesize(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl,
145     const char *class, uint64_t afar, uint8_t afar_status, uint64_t afsr,
146     uint16_t synd, uint8_t synd_status, ce_dispact_t type, uint64_t disp,
147     cmd_xe_handler_f *hdlr)
148 {
149 	nvlist_t *fmri;
150 	int rc;
151 
152 	if ((fmri = cmd_dimm_fmri_derive(hdl, afar, synd, afsr)) == NULL)
153 		return (CMD_EVD_UNUSED);
154 
155 	rc = hdlr(hdl, ep, nvl, class, afar, afar_status, synd, synd_status,
156 	    type, disp, fmri);
157 
158 	nvlist_free(fmri);
159 
160 	return (rc);
161 }
162 
163 static cmd_iorxefrx_t *
iorxefrx_match(fmd_hdl_t * hdl,cmd_errcl_t errcl,cmd_errcl_t matchmask,uint_t det_agentid,uint_t afsr_agentid)164 iorxefrx_match(fmd_hdl_t *hdl, cmd_errcl_t errcl, cmd_errcl_t matchmask,
165     uint_t det_agentid, uint_t afsr_agentid)
166 {
167 	cmd_iorxefrx_t *rf;
168 
169 	for (rf = cmd_list_next(&cmd.cmd_iorxefrx); rf != NULL;
170 	    rf = cmd_list_next(rf)) {
171 
172 		fmd_hdl_debug(hdl, "rf->rf_errcl = %llx, matchmask = %llx\n"
173 		    "rf->rf_det_agentid = %lx, afsr_agentid = %lx\n"
174 		    "rf->rf_afsr_agentid = %lx, det_agentid = %lx\n",
175 		    rf->rf_errcl, matchmask, rf->rf_det_agentid, afsr_agentid,
176 		    rf->rf_afsr_agentid, det_agentid);
177 
178 		if ((rf->rf_errcl & matchmask) == 0)
179 			continue;
180 
181 		/*
182 		 * For IOxEs we are unable to match based on both the detector
183 		 * and the captured Agent Id in the AFSR, because the bridge
184 		 * captures it's own Agent Id instead of the remote CPUs.
185 		 *
186 		 * Also, the LSB of Tomatillo's jpid is aliased for each chip
187 		 * and therefore needs to be factored out of our matching.
188 		 */
189 		if ((CMD_ERRCL_ISIOXE(rf->rf_errcl) ||
190 		    CMD_ERRCL_ISIOXE(errcl)) &&
191 		    ((rf->rf_afsr_agentid & TOM_AID_MATCH_MASK) ==
192 		    (afsr_agentid & TOM_AID_MATCH_MASK)))
193 			return (rf);
194 
195 		/*
196 		 * Check for both here since IOxE is not involved
197 		 */
198 		if ((rf->rf_afsr_agentid == det_agentid) &&
199 		    (rf->rf_det_agentid == afsr_agentid))
200 			return (rf);
201 	}
202 
203 	return (NULL);
204 }
205 
206 /*
207  * Got an RxE or an FRx.  FRx ereports can be matched with RxE ereports and
208  * vice versa.  FRx ereports can also be matched with IOxE ereports.
209  */
210 cmd_evdisp_t
cmd_rxefrx_common(fmd_hdl_t * hdl,fmd_event_t * ep,nvlist_t * nvl,const char * class,cmd_errcl_t clcode,cmd_errcl_t matchmask)211 cmd_rxefrx_common(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl,
212     const char *class, cmd_errcl_t clcode, cmd_errcl_t matchmask)
213 {
214 	cmd_xe_handler_f *hdlr;
215 	cmd_iorxefrx_t *rfmatch, *rferr;
216 	cmd_cpu_t *cpu;
217 	char *typenm;
218 	int isrxe = CMD_ERRCL_MATCH(clcode, CMD_ERRCL_RCE | CMD_ERRCL_RUE);
219 	int isce = CMD_ERRCL_MATCH(clcode, CMD_ERRCL_RCE | CMD_ERRCL_FRC);
220 	int rc;
221 	int minorvers = 1;
222 	uint8_t level = clcode & CMD_ERRCL_LEVEL_EXTRACT;
223 
224 	clcode &= CMD_ERRCL_LEVEL_MASK;
225 	rferr = fmd_hdl_zalloc(hdl, sizeof (cmd_iorxefrx_t), FMD_SLEEP);
226 
227 	if (nvlist_lookup_pairs(nvl, 0,
228 	    FM_EREPORT_PAYLOAD_NAME_SYND, DATA_TYPE_UINT16, &rferr->rf_synd,
229 	    FM_EREPORT_PAYLOAD_NAME_SYND_STATUS, DATA_TYPE_UINT8,
230 	    &rferr->rf_synd_status,
231 	    FM_EREPORT_PAYLOAD_NAME_AFAR, DATA_TYPE_UINT64, &rferr->rf_afar,
232 	    FM_EREPORT_PAYLOAD_NAME_AFAR_STATUS, DATA_TYPE_UINT8,
233 	    &rferr->rf_afar_status,
234 	    FM_EREPORT_PAYLOAD_NAME_AFSR, DATA_TYPE_UINT64, &rferr->rf_afsr,
235 	    FM_EREPORT_PAYLOAD_NAME_ERR_TYPE, DATA_TYPE_STRING, &typenm,
236 	    NULL) != 0) {
237 		fmd_hdl_free(hdl, rferr, sizeof (cmd_iorxefrx_t));
238 		return (CMD_EVD_BAD);
239 	}
240 	if (nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_NAME_ERR_DISP,
241 	    &rferr->rf_disp) != 0)
242 		minorvers = 0;
243 
244 	rferr->rf_type = cmd_mem_name2type(typenm, minorvers);
245 
246 	if ((cpu = cmd_cpu_lookup_from_detector(hdl, nvl, class,
247 	    level)) == NULL) {
248 		fmd_hdl_free(hdl, rferr, sizeof (cmd_iorxefrx_t));
249 		return (CMD_EVD_UNUSED);
250 	}
251 
252 	if (!isrxe && rferr->rf_synd_status != AFLT_STAT_VALID) {
253 		fmd_hdl_free(hdl, rferr, sizeof (cmd_iorxefrx_t));
254 		return (CMD_EVD_UNUSED);
255 	}
256 
257 	if (isrxe) {
258 		rferr->rf_afsr_agentid = (rferr->rf_afsr &
259 		    USIIIi_AFSR_JREQ) >> USIIIi_AFSR_JREQ_SHIFT;
260 	} else {
261 		rferr->rf_afsr_agentid = (rferr->rf_afsr &
262 		    USIIIi_AFSR_AID) >> USIIIi_AFSR_AID_SHIFT;
263 	}
264 
265 	rferr->rf_errcl = clcode;
266 	rferr->rf_det_agentid = cpu->cpu_cpuid;
267 
268 	if ((rfmatch = iorxefrx_match(hdl, clcode, matchmask,
269 	    rferr->rf_det_agentid, rferr->rf_afsr_agentid)) == NULL) {
270 		cmd_iorxefrx_queue(hdl, rferr);
271 		return (CMD_EVD_OK);
272 	}
273 
274 	/*
275 	 * Found a match.  Send a synthesized ereport to the appropriate
276 	 * routine.
277 	 */
278 	fmd_hdl_debug(hdl, "matched %cE %llx with %llx", "UC"[isce],
279 	    rferr->rf_errcl, rfmatch->rf_errcl);
280 
281 	hdlr = (isce ? cmd_ce_common : cmd_ue_common);
282 	if (isrxe) {
283 		rc = iorxefrx_synthesize(hdl, ep, nvl, class, rferr->rf_afar,
284 		    rferr->rf_afar_status, rfmatch->rf_afsr, rfmatch->rf_synd,
285 		    rfmatch->rf_synd_status, rferr->rf_type, rferr->rf_disp,
286 		    hdlr);
287 	} else {
288 		rc = iorxefrx_synthesize(hdl, ep, nvl, class, rfmatch->rf_afar,
289 		    rfmatch->rf_afar_status, rferr->rf_afsr, rferr->rf_synd,
290 		    rferr->rf_synd_status, rfmatch->rf_type, rferr->rf_disp,
291 		    hdlr);
292 	}
293 
294 	cmd_iorxefrx_free(hdl, rfmatch);
295 	fmd_hdl_free(hdl, rferr, sizeof (cmd_iorxefrx_t));
296 
297 	return (rc);
298 }
299 
300 /*
301  * This fire IOxE must be matched with an FRx before UE/CE processing
302  * is possible.
303  *
304  * Note that for fire ereports we don't receive AFSR, AFAR, AFAR-Status
305  * and SYND values but we can derive the AFAR from the payload value
306  * FIRE_JBC_JITEL1.  We may receive a TYPNM value.
307  */
308 static cmd_evdisp_t
cmd_ioxefrx_fire(fmd_hdl_t * hdl,fmd_event_t * ep,nvlist_t * nvl,const char * class,cmd_errcl_t errcl,cmd_errcl_t matchmask)309 cmd_ioxefrx_fire(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl,
310     const char *class, cmd_errcl_t errcl, cmd_errcl_t matchmask)
311 {
312 	cmd_xe_handler_f *hdlr;
313 	cmd_iorxefrx_t *rfmatch, *rferr;
314 	uint64_t afar;
315 	int isce = CMD_ERRCL_MATCH(errcl, CMD_ERRCL_IOCE);
316 	char *portid_str;
317 	char *path = NULL;
318 	char *typenm = NULL;
319 	nvlist_t *det = NULL;
320 	int rc;
321 	int minorvers = 1;
322 
323 	rferr = fmd_hdl_zalloc(hdl, sizeof (cmd_iorxefrx_t), FMD_SLEEP);
324 
325 	/*
326 	 * Lookup device path of host bridge.
327 	 */
328 	(void) nvlist_lookup_nvlist(nvl, FM_EREPORT_DETECTOR, &det);
329 	(void) nvlist_lookup_string(det, FM_FMRI_DEV_PATH, &path);
330 
331 	/*
332 	 * get Jbus port id from the device path
333 	 */
334 	portid_str = strrchr(path, '@') + 1;
335 	rferr->rf_det_agentid = strtol(portid_str, NULL, 16);
336 
337 	rferr->rf_errcl = errcl;
338 	rferr->rf_afsr_agentid = FIRE_AID;
339 	rferr->rf_afar_status = AFLT_STAT_VALID;
340 	rferr->rf_synd_status = AFLT_STAT_VALID;
341 
342 	/*
343 	 * Extract the afar from the payload
344 	 */
345 	(void) nvlist_lookup_uint64(nvl, FIRE_JBC_JITEL1, &afar);
346 	rferr->rf_afar = afar & FIRE_JBC_ADDR_MASK;
347 
348 	rferr->rf_afsr = NULL;
349 	rferr->rf_synd = NULL;
350 
351 	if (nvlist_lookup_string(nvl, FM_EREPORT_PAYLOAD_NAME_ERR_TYPE,
352 	    &typenm) == 0)
353 		rferr->rf_type = cmd_mem_name2type(typenm, minorvers);
354 
355 	/*
356 	 * Need to send in the io_jpid that we get from the device path above
357 	 * for both the det_agentid and the afsr_agentid, since the CPU does not
358 	 * capture the same address as the bridge.  The bridge has the LSB
359 	 * aliased and the CPU is missing the MSB.
360 	 */
361 	if ((rfmatch = iorxefrx_match(hdl, rferr->rf_errcl, matchmask,
362 	    rferr->rf_det_agentid, rferr->rf_afsr_agentid)) == NULL) {
363 		cmd_iorxefrx_queue(hdl, rferr);
364 		return (CMD_EVD_OK);
365 		}
366 
367 	/* Found a match.  Synthesize an ereport for UE/CE processing. */
368 	fmd_hdl_debug(hdl, "matched %cE %llx with %llx\n", "UC"[isce],
369 	    rferr->rf_errcl, rfmatch->rf_errcl);
370 
371 	hdlr = (isce ? cmd_ce_common : cmd_ue_common);
372 	rc = iorxefrx_synthesize(hdl, ep, nvl, class, rferr->rf_afar,
373 	    rferr->rf_afar_status, rfmatch->rf_afsr, rfmatch->rf_synd,
374 	    rfmatch->rf_synd_status, rferr->rf_type, rferr->rf_disp, hdlr);
375 
376 	cmd_iorxefrx_free(hdl, rfmatch);
377 	fmd_hdl_free(hdl, rferr, sizeof (cmd_iorxefrx_t));
378 
379 	return (rc);
380 }
381 
382 /* This IOxE must be matched with an FRx before UE/CE processing is possible */
383 static cmd_evdisp_t
cmd_ioxefrx_common(fmd_hdl_t * hdl,fmd_event_t * ep,nvlist_t * nvl,const char * class,cmd_errcl_t errcl,cmd_errcl_t matchmask)384 cmd_ioxefrx_common(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl,
385     const char *class, cmd_errcl_t errcl, cmd_errcl_t matchmask)
386 {
387 	cmd_xe_handler_f *hdlr;
388 	cmd_iorxefrx_t *rfmatch, *rferr;
389 	char *typenm;
390 	int isce = CMD_ERRCL_MATCH(errcl, CMD_ERRCL_IOCE);
391 	char *portid_str;
392 	char *path = NULL;
393 	nvlist_t *det = NULL;
394 	int rc;
395 	int minorvers = 1;
396 
397 	rferr = fmd_hdl_zalloc(hdl, sizeof (cmd_iorxefrx_t), FMD_SLEEP);
398 
399 	if (nvlist_lookup_pairs(nvl, 0,
400 	    PCI_ECC_AFAR, DATA_TYPE_UINT64, &rferr->rf_afar,
401 	    PCI_ECC_AFSR, DATA_TYPE_UINT64, &rferr->rf_afsr,
402 	    PCI_ECC_SYND, DATA_TYPE_UINT16, &rferr->rf_synd,
403 	    PCI_ECC_TYPE, DATA_TYPE_STRING, &typenm,
404 	    NULL) != 0) {
405 		fmd_hdl_free(hdl, rferr, sizeof (cmd_iorxefrx_t));
406 		return (CMD_EVD_BAD);
407 	}
408 
409 	if (nvlist_lookup_uint64(nvl, PCI_ECC_DISP, &rferr->rf_disp) != 0)
410 		minorvers = 0;
411 
412 	rferr->rf_type = cmd_mem_name2type(typenm, minorvers);
413 	rferr->rf_errcl = errcl;
414 
415 	/*
416 	 * Lookup device path of host bridge.
417 	 */
418 	(void) nvlist_lookup_nvlist(nvl, FM_EREPORT_DETECTOR, &det);
419 	(void) nvlist_lookup_string(det, FM_FMRI_DEV_PATH, &path);
420 
421 	/*
422 	 * get Jbus port id from the device path
423 	 */
424 	portid_str = strrchr(path, '@') + 1;
425 	rferr->rf_det_agentid = strtol(portid_str, NULL, 16);
426 
427 	rferr->rf_afsr_agentid = (rferr->rf_afsr &
428 	    SCHIZO_ECC_UE_AFSR_AGENT_MID) >> SCHIZO_ECC_UE_AFSR_AGENT_MID_SHIFT;
429 
430 	/*
431 	 * Only 4 bits of the Jbus AID are sent on the Jbus.  MSB is the one
432 	 * that is chosen not to make the trip.  This is not in any of the Jbus
433 	 * or Tomatillo documents and was discovered during testing and verified
434 	 * by Jalapeno H/W designer.
435 	 */
436 	rferr->rf_afsr_agentid &= 0xf;
437 	rferr->rf_afar_status = AFLT_STAT_VALID;
438 	rferr->rf_synd_status = AFLT_STAT_VALID;
439 
440 	/*
441 	 * Need to send in the io_jpid that we get from the device path above
442 	 * for both the det_agentid and the afsr_agentid, since the CPU does not
443 	 * capture the same address as the bridge.  The bridge has the LSB
444 	 * aliased and the CPU is missing the MSB.
445 	 */
446 	if ((rfmatch = iorxefrx_match(hdl, rferr->rf_errcl, matchmask,
447 	    rferr->rf_det_agentid, rferr->rf_afsr_agentid)) == NULL) {
448 		cmd_iorxefrx_queue(hdl, rferr);
449 		return (CMD_EVD_OK);
450 	}
451 
452 	/* Found a match.  Synthesize an ereport for UE/CE processing. */
453 	fmd_hdl_debug(hdl, "matched %cE %llx with %llx\n", "UC"[isce],
454 	    rferr->rf_errcl, rfmatch->rf_errcl);
455 
456 	hdlr = (isce ? cmd_ce_common : cmd_ue_common);
457 	rc = iorxefrx_synthesize(hdl, ep, nvl, class, rferr->rf_afar,
458 	    rferr->rf_afar_status, rfmatch->rf_afsr, rfmatch->rf_synd,
459 	    rfmatch->rf_synd_status, rferr->rf_type, rferr->rf_disp, hdlr);
460 
461 	cmd_iorxefrx_free(hdl, rfmatch);
462 	fmd_hdl_free(hdl, rferr, sizeof (cmd_iorxefrx_t));
463 
464 	return (rc);
465 }
466 
467 /* IOxE ereports that don't need matching with FRx ereports */
468 static cmd_evdisp_t
ioxe_common(fmd_hdl_t * hdl,fmd_event_t * ep,nvlist_t * nvl,const char * class,cmd_errcl_t clcode)469 ioxe_common(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class,
470     cmd_errcl_t clcode)
471 {
472 	int isce = CMD_ERRCL_MATCH(clcode, CMD_ERRCL_IOCE);
473 	cmd_xe_handler_f *hdlr = isce ? cmd_ce_common : cmd_ue_common;
474 	uint64_t afar;
475 	uint16_t synd;
476 	nvlist_t *rsrc;
477 	char *typenm;
478 	uint64_t disp;
479 	int minorvers = 1;
480 
481 	if (nvlist_lookup_pairs(nvl, 0,
482 	    PCI_ECC_AFAR, DATA_TYPE_UINT64, &afar,
483 	    PCI_ECC_SYND, DATA_TYPE_UINT16, &synd,
484 	    PCI_ECC_TYPE, DATA_TYPE_STRING, &typenm,
485 	    PCI_ECC_RESOURCE, DATA_TYPE_NVLIST, &rsrc,
486 	    NULL) != 0)
487 		return (CMD_EVD_BAD);
488 
489 	if (nvlist_lookup_uint64(nvl, PCI_ECC_DISP, &disp) != 0)
490 		minorvers = 0;
491 
492 	return (hdlr(hdl, ep, nvl, class, afar, AFLT_STAT_VALID, synd,
493 	    AFLT_STAT_VALID, cmd_mem_name2type(typenm, minorvers), disp,
494 	    rsrc));
495 }
496 
497 cmd_evdisp_t
cmd_rxe(fmd_hdl_t * hdl,fmd_event_t * ep,nvlist_t * nvl,const char * class,cmd_errcl_t clcode)498 cmd_rxe(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class,
499     cmd_errcl_t clcode)
500 {
501 	cmd_errcl_t matchmask = (clcode == CMD_ERRCL_RCE ? CMD_ERRCL_FRC :
502 	    CMD_ERRCL_FRU);
503 
504 	return (cmd_rxefrx_common(hdl, ep, nvl, class, clcode, matchmask));
505 }
506 
507 cmd_evdisp_t
cmd_ioxe(fmd_hdl_t * hdl,fmd_event_t * ep,nvlist_t * nvl,const char * class,cmd_errcl_t clcode)508 cmd_ioxe(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class,
509     cmd_errcl_t clcode)
510 {
511 	cmd_errcl_t matchmask = (clcode == CMD_ERRCL_IOCE ? CMD_ERRCL_FRC :
512 	    CMD_ERRCL_FRU);
513 
514 	if (fmd_nvl_class_match(hdl, nvl, "ereport.io.tom.*")) {
515 		return (cmd_ioxefrx_common(hdl, ep, nvl, class, clcode,
516 		    matchmask));
517 	} else  if (fmd_nvl_class_match(hdl, nvl, "ereport.io.fire.*")) {
518 			return (cmd_ioxefrx_fire(hdl, ep, nvl, class, clcode,
519 			    matchmask));
520 	} else
521 		return (ioxe_common(hdl, ep, nvl, class, clcode));
522 }
523 
524 /*ARGSUSED*/
525 cmd_evdisp_t
cmd_ioxe_sec(fmd_hdl_t * hdl,fmd_event_t * ep,nvlist_t * nvl,const char * class,cmd_errcl_t clcode)526 cmd_ioxe_sec(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class,
527     cmd_errcl_t clcode)
528 {
529 	/*
530 	 * Secondary IOxE's can't be used to identify failed or failing
531 	 * resources, as they don't contain enough information.  Ignore them.
532 	 */
533 	return (CMD_EVD_OK);
534 }
535 
536 /*ARGSUSED*/
537 ulong_t
cmd_mem_get_phys_pages(fmd_hdl_t * hdl)538 cmd_mem_get_phys_pages(fmd_hdl_t *hdl)
539 {
540 	return (sysconf(_SC_PHYS_PAGES));
541 }
542 
/*
 * sun4u bit position as function of e_synd,
 * from JPS1 Implementation Supplement table P-7
 * Encode bit positions as follows:
 * 0-127 data bits 0-127
 * 128-136 check bits 0-8 (Cn = 128+n)
 * no error or multibit error = -1 (not valid CE)
 */

int esynd2bit [] = {
	-1, 128, 129, -1, 130, -1, -1, 47,
	131, -1, -1, 53, -1, 41, 29, -1, /* 000-00F */
	132, -1, -1, 50, -1, 38, 25, -1,
	-1, 33, 24, -1, 11, -1, -1, 16, /* 010-01F */
	133, -1, -1, 46, -1, 37, 19, -1,
	-1, 31, 32, -1,  7, -1, -1, 10, /* 020-02F */
	-1, 40, 13, -1, 59, -1, -1, 66,
	-1, -1, -1,  0, -1, 67, 71, -1, /* 030-03F */
	134, -1, -1, 43, -1, 36, 18, -1,
	-1, 49, 15, -1, 63, -1, -1,  6, /* 040-04F */
	-1, 44, 28, -1, -1, -1, -1, 52,
	68, -1, -1, 62, -1, -1, -1, -1, /* 050-05F */
	-1, 26, 106, -1, 64, -1, -1,  2,
	120, -1, -1, -1, -1, -1, -1, -1, /* 060-06F */
	116, -1, -1, -1, -1, -1, -1, -1,
	-1, 58, 54, -1, -1, -1, -1, -1, /* 070-07F */
	135, -1, -1, 42, -1, 35, 17, -1,
	-1, 45, 14, -1, 21, -1, -1,  5, /* 080-08F */
	-1, 27, -1, -1, 99, -1, -1,  3,
	114, -1, -1, 20, -1, -1, -1, -1, /* 090-09F */
	-1, 23, 113, -1, 112, -1, -1, 51,
	95, -1, -1, -1, -1, -1, -1, -1, /* 0A0-0AF */
	103, -1, -1, -1, -1, -1, -1, -1,
	-1, 48, -1, -1, 73, -1, -1, -1, /* 0B0-0BF */
	-1, 22, 110, -1, 109, -1, -1,  9,
	108, -1, -1, -1, -1, -1, -1, -1, /* 0C0-0CF */
	102, -1, -1, -1, -1, -1, -1, -1,
	-1, -1, -1, -1, -1, -1, -1, -1, /* 0D0-0DF */
	98, -1, -1, -1, -1, -1, -1, -1,
	-1, -1, -1, -1, -1, -1, -1, -1, /* 0E0-0EF */
	-1, -1, -1, -1, -1, -1, -1, -1,
	56, -1, -1, -1, -1, -1, -1, -1, /* 0F0-0FF */
	136, -1, -1, 39, -1, 34, 105, -1,
	-1, 30, 104, -1, 101, -1, -1,  4, /* 100-10F */
	-1, -1, 100, -1, 83, -1, -1, 12,
	87, -1, -1, 57, -1, -1, -1, -1, /* 110-11F */
	-1, 97, 82, -1, 78, -1, -1,  1,
	96, -1, -1, -1, -1, -1, -1, -1, /* 120-12F */
	94, -1, -1, -1, -1, -1, -1, -1,
	-1, -1, 79, -1, 69, -1, -1, -1, /* 130-13F */
	-1, 93, 92, -1, 91, -1, -1,  8,
	90, -1, -1, -1, -1, -1, -1, -1, /* 140-14F */
	89, -1, -1, -1, -1, -1, -1, -1,
	-1, -1, -1, -1, -1, -1, -1, -1, /* 150-15F */
	86, -1, -1, -1, -1, -1, -1, -1,
	-1, -1, -1, -1, -1, -1, -1, -1, /* 160-16F */
	-1, -1, -1, -1, -1, -1, -1, -1,
	60, -1, -1, -1, -1, -1, -1, -1, /* 170-17F */
	-1, 88, 85, -1, 84, -1, -1, 55,
	81, -1, -1, -1, -1, -1, -1, -1, /* 180-18F */
	77, -1, -1, -1, -1, -1, -1, -1,
	-1, -1, -1, -1, -1, -1, -1, -1, /* 190-19F */
	74, -1, -1, -1, -1, -1, -1, -1,
	-1, -1, -1, -1, -1, -1, -1, -1, /* 1A0-1AF */
	-1, 70, 107, -1, 65, -1, -1, -1,
	127, -1, -1, -1, -1, -1, -1, -1, /* 1B0-1BF */
	80, -1, -1, 72, -1, 119, 118, -1,
	-1, 126, 76, -1, 125, -1, -1, -1, /* 1C0-1CF */
	-1, 115, 124, -1, 75, -1, -1, -1,
	61, -1, -1, -1, -1, -1, -1, -1, /* 1D0-1DF */
	-1, 123, 122, -1, 121, -1, -1, -1,
	117, -1, -1, -1, -1, -1, -1, -1, /* 1E0-1EF */
	111, -1, -1, -1, -1, -1, -1, -1,
	-1, -1, -1, -1, -1, -1, -1, -1  /* 1F0-1FF */
};

int msynd2bit [] = {  /* msynd 0-F */
	-1, 140, 141,  -1,
	142, -1,  -1, 137,
	143, -1,  -1, 138,
	-1, 139,  -1,  -1
};

/*
 * Map an E syndrome to its unit (bit) position using esynd2bit[].
 *
 * Returns the encoded bit position, or -1 when the syndrome does not
 * identify a single bit.  A syndrome beyond the end of the table is also
 * reported as -1 rather than being used as an index, since the argument
 * can hold values far larger than the table covers.
 */
int
cmd_synd2upos(uint16_t syndrome)
{
	if (syndrome >= sizeof (esynd2bit) / sizeof (esynd2bit[0]))
		return (-1);

	return (esynd2bit[syndrome]);
}
630 
/* Supplies the platform name string compared against id_plat[] below. */
const char *fmd_fmri_get_platform();

/* Size of the buffer used to format a board FRU name. */
#define	DP_MAX	25

/* FRU names for platform ids 3 and 4, indexed by board id. */
const char *slotname[] = {
	"Slot A",
	"Slot B",
	"Slot C",
	"Slot D"
};

/*
 * One suspect in a datapath fault: the board or CPU id, and the number of
 * reports attributed to it.
 */
typedef struct fault_info {
	uint32_t id;
	int count;
} fault_info_t;

/*
 * Platform name to platform id.  The id selects how a CPU id is turned into
 * a board id and how the board's FRU name is formatted.  The table is
 * terminated by an entry whose name is NULL.
 */
struct plat2id_map {
	char *platnm;
	int id;
} id_plat[] = {
	{ "SUNW,Sun-Fire-15000",	1 },
	{ "SUNW,Sun-Fire",		2 },
	{ "SUNW,Netra-T12",		2 },
	{ "SUNW,Sun-Fire-480R",		3 },
	{ "SUNW,Sun-Fire-V490",		3 },
	{ "SUNW,Sun-Fire-V440",		3 },
	{ "SUNW,Sun-Fire-V445",		3 },
	{ "SUNW,Netra-440",		3 },
	{ "SUNW,Sun-Fire-880",		4 },
	{ "SUNW,Sun-Fire-V890",		4 },
	{ NULL,				0 }
};
659 
/*
 * Store the hashed form of afar in *addr.  Here the address is stored
 * unchanged; class is not used.
 */
/*ARGSUSED*/
void
cmd_to_hashed_addr(uint64_t *addr, uint64_t afar, const char *class)
{
	addr[0] = afar;
}
666 
667 /*ARGSUSED*/
668 int
cmd_same_datapath_dimms(cmd_dimm_t * d1,cmd_dimm_t * d2)669 cmd_same_datapath_dimms(cmd_dimm_t *d1, cmd_dimm_t *d2)
670 {
671 	return (1);
672 }
673 
674 static int
cmd_get_platform()675 cmd_get_platform()
676 {
677 	const char *platname;
678 	int id = -1;
679 	int i;
680 
681 	platname = fmd_fmri_get_platform();
682 	for (i = 0; id_plat[i].platnm != NULL; i++) {
683 		if (strcmp(platname, id_plat[i].platnm) == 0) {
684 			id = id_plat[i].id;
685 			break;
686 		}
687 	}
688 	return (id);
689 }
690 
/*
 * Convert a CPU id into a board id using the numbering rule for the
 * current platform id (see cmd_get_platform()):
 *
 *	1	bits 5-9 of the CPU id
 *	2	low five bits of the CPU id, divided by four
 *	3	low three bits: even -> 0, odd -> 1
 *	4	low three bits: even -> 0 (below 4) or 2,
 *		odd -> 1 (below 5) or 3
 *
 * Any other platform id yields 5.
 */
static int
cmd_get_boardid(uint32_t cpuid)
{
	uint32_t low3 = cpuid & 0x07;
	int odd = ((low3 % 2) != 0);

	switch (cmd_get_platform()) {
	case 1:
		return ((cpuid >> 5) & 0x1f);
	case 2:
		return ((cpuid & 0x1f) / 4);
	case 3:
		return (odd ? 1 : 0);
	case 4:
		if (odd)
			return ((low3 < 5) ? 1 : 3);
		return ((low3 < 4) ? 0 : 2);
	default:
		return (5);
	}
}
723 
724 static void
cmd_get_faulted_comp(fmd_hdl_t * hdl,cmd_dimm_t * d1,cmd_dimm_t * d2,uint16_t upos,fault_info_t ** fault_list,int cpu)725 cmd_get_faulted_comp(fmd_hdl_t *hdl, cmd_dimm_t *d1, cmd_dimm_t *d2,
726     uint16_t upos, fault_info_t **fault_list, int cpu)
727 {
728 	cmd_mq_t *ip;
729 	int i, j, k, idj;
730 	uint32_t id;
731 	uint32_t *cpuid = NULL;
732 	int max_rpt;
733 
734 	max_rpt = 2 * cmd.cmd_nupos;
735 
736 	cpuid = fmd_hdl_alloc(hdl, max_rpt * sizeof (uint32_t), FMD_SLEEP);
737 
738 	if (cpuid == NULL)
739 		return;
740 
741 	for (i = 0, j = 0; i < CMD_MAX_CKWDS; i++) {
742 		for (ip = cmd_list_next(&d1->mq_root[i]); ip != NULL;
743 		    ip = cmd_list_next(ip)) {
744 			if (upos == ip->mq_unit_position) {
745 				cpuid[j] = ip->mq_cpuid;
746 				j++;
747 			}
748 			if (j >= cmd.cmd_nupos)
749 				break;
750 		}
751 		if (j >= cmd.cmd_nupos)
752 			break;
753 	}
754 
755 	for (i = 0; i < CMD_MAX_CKWDS; i++) {
756 		for (ip = cmd_list_next(&d2->mq_root[i]); ip != NULL;
757 		    ip = cmd_list_next(ip)) {
758 			if (upos == ip->mq_unit_position) {
759 				cpuid[j] = ip->mq_cpuid;
760 				j++;
761 			}
762 			if (j >= max_rpt)
763 				break;
764 		}
765 		if (j >= max_rpt)
766 			break;
767 	}
768 
769 	for (i = 0, k = 0; i < max_rpt; i++) {
770 		if (cpuid[i] == ULONG_MAX)
771 			continue;
772 		id = (cpu == 0) ? cmd_get_boardid(cpuid[i]) : cpuid[i];
773 		fault_list[k] = fmd_hdl_alloc(hdl,
774 		    sizeof (fault_info_t), FMD_SLEEP);
775 		if (fault_list[k] == NULL)
776 			break;
777 		fault_list[k]->count = 1;
778 		fault_list[k]->id = id;
779 		for (j = i + 1; j < max_rpt; j++) {
780 			if (cpuid[j] == ULONG_MAX)
781 				continue;
782 			idj = (cpu == 0) ? cmd_get_boardid(cpuid[j]) : cpuid[j];
783 			if (id == idj) {
784 				fault_list[k]->count++;
785 				cpuid[j] = ULONG_MAX;
786 			}
787 		}
788 		k++;
789 	}
790 
791 	fmd_hdl_free(hdl, cpuid, max_rpt * sizeof (uint32_t));
792 }
793 
794 /*ARGSUSED*/
795 static nvlist_t *
cmd_board_mkfru(fmd_hdl_t * hdl,char * frustr)796 cmd_board_mkfru(fmd_hdl_t *hdl, char *frustr)
797 {
798 	nvlist_t *hcel, *fru;
799 	int err;
800 
801 	if (frustr == NULL)
802 		return (NULL);
803 
804 	if (nvlist_alloc(&hcel, NV_UNIQUE_NAME, 0) != 0)
805 		return (NULL);
806 
807 	err = nvlist_add_string(hcel, FM_FMRI_HC_NAME,
808 	    FM_FMRI_LEGACY_HC);
809 	err |= nvlist_add_string(hcel, FM_FMRI_HC_ID, frustr);
810 	if (err != 0) {
811 		nvlist_free(hcel);
812 		return (NULL);
813 	}
814 
815 	if (nvlist_alloc(&fru, NV_UNIQUE_NAME, 0) != 0) {
816 		nvlist_free(hcel);
817 		return (NULL);
818 	}
819 	err = nvlist_add_uint8(fru, FM_VERSION, FM_HC_SCHEME_VERSION);
820 	err |= nvlist_add_string(fru, FM_FMRI_SCHEME,
821 	    FM_FMRI_SCHEME_HC);
822 	err |= nvlist_add_string(fru, FM_FMRI_HC_ROOT, "");
823 	err |= nvlist_add_uint32(fru, FM_FMRI_HC_LIST_SZ, 1);
824 	err |= nvlist_add_nvlist_array(fru, FM_FMRI_HC_LIST, &hcel, 1);
825 	if (err != 0) {
826 		nvlist_free(fru);
827 		nvlist_free(hcel);
828 		return (NULL);
829 	}
830 	nvlist_free(hcel);
831 	return (fru);
832 }
833 
834 /*
835  * Startcat, Serengeti, V4xx, and V8xx: fault the system boards of
836  * the detectors in proportion to the number of ereports out of 8
837  * Other systems: fault the detectors in proportion to the number of
838  * ereports out of 8
839  */
840 void
cmd_gen_datapath_fault(fmd_hdl_t * hdl,cmd_dimm_t * d1,cmd_dimm_t * d2,uint16_t upos,nvlist_t * det)841 cmd_gen_datapath_fault(fmd_hdl_t *hdl, cmd_dimm_t *d1, cmd_dimm_t *d2,
842     uint16_t upos, nvlist_t *det)
843 {
844 	char frustr[DP_MAX];
845 	fmd_case_t *cp;
846 	int i, ratio, type, fault_cpu, max_rpt;
847 	uint32_t id;
848 	uint8_t cpumask;
849 	char *cpustr;
850 	fault_info_t **fault_list = NULL;
851 	nvlist_t *fru = NULL, *asru = NULL, *flt = NULL;
852 
853 	max_rpt = cmd.cmd_nupos * 2;
854 	fault_list = fmd_hdl_alloc(hdl,
855 	    max_rpt * sizeof (fault_info_t *), FMD_SLEEP);
856 
857 	if (fault_list == NULL)
858 		return;
859 
860 	for (i = 0; i < max_rpt; i++)
861 		fault_list[i] = NULL;
862 
863 	type = cmd_get_platform();
864 
865 	fault_cpu = (type == -1) ? 1 : 0;
866 
867 	cmd_get_faulted_comp(hdl, d1, d2, upos, fault_list, fault_cpu);
868 
869 	cp = fmd_case_open(hdl, NULL);
870 
871 	for (i = 0; i < max_rpt; i++) {
872 		if (fault_list[i] == NULL)
873 			continue;
874 		id = fault_list[i]->id;
875 
876 		switch (type) {
877 		case 1:
878 			(void) snprintf(frustr, DP_MAX, "EX%d", id);
879 			break;
880 		case 2:
881 			(void) snprintf(frustr, DP_MAX, "/N0/SB%d", id);
882 			break;
883 		case 3:
884 		case 4:
885 			(void) snprintf(frustr, DP_MAX, slotname[id]);
886 			break;
887 		default:
888 			cpustr = cmd_cpu_getfrustr_by_id(hdl, id);
889 			if (nvlist_lookup_uint8(det, FM_FMRI_CPU_MASK, &cpumask)
890 			    == 0) {
891 				asru = cmd_cpu_fmri_create(id, cpumask);
892 				(void) fmd_nvl_fmri_expand(hdl, asru);
893 			}
894 			break;
895 		}
896 
897 		ratio = (fault_list[i]->count * 100) / (cmd.cmd_nupos * 2);
898 
899 		if (fault_cpu) {
900 			fru = cmd_cpu_mkfru(hdl, cpustr, NULL, NULL);
901 			fmd_hdl_strfree(hdl, cpustr);
902 			if (fru == NULL) {
903 				nvlist_free(asru);
904 				break;
905 			}
906 			flt = cmd_nvl_create_fault(hdl, "fault.memory.datapath",
907 			    ratio, asru, fru, asru);
908 			nvlist_free(asru);
909 		} else {
910 			fru = cmd_board_mkfru(hdl, frustr);
911 			if (fru == NULL)
912 				break;
913 			flt = cmd_nvl_create_fault(hdl, "fault.memory.datapath",
914 			    ratio, fru, fru, fru);
915 		}
916 
917 		fmd_case_add_suspect(hdl, cp, flt);
918 
919 		/* free up memory */
920 		nvlist_free(fru);
921 	}
922 
923 	fmd_case_solve(hdl, cp);
924 
925 	for (i = 0; i < max_rpt; i++) {
926 		if (fault_list[i] != NULL)
927 			fmd_hdl_free(hdl, fault_list[i], sizeof (fault_info_t));
928 	}
929 
930 	fmd_hdl_free(hdl, fault_list, sizeof (fault_info_t *) * max_rpt);
931 }
932