xref: /titanic_52/usr/src/cmd/fm/modules/sun4u/cpumem-diagnosis/cmd_dperr.c (revision 19d61fc7991644175873937566d932d8cf52912a)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 /*
29  * Ereport-handling routines for Datapath errors
30  * - receive datapath ereports and open datapath case
31  * - solve datapath case when datapath fault ereports are received
32  * - maintain state of datapath error flag
33  * - close datapath case when timeout occurs (w/o fault)
34  */
35 
36 
37 #include <strings.h>
38 #include <string.h>
39 #include <errno.h>
40 #include <fm/fmd_api.h>
41 #include <sys/fm/protocol.h>
42 #include <sys/async.h>
43 #include <sys/time.h>
44 #include <cmd.h>
45 #include <cmd_state.h>
46 #include <cmd_dp.h>
47 #include <cmd_dp_page.h>
48 #include <cmd_page.h>
49 #include <libnvpair.h>
50 #include <sys/plat_datapath.h>
51 
52 /*
53  * Member Name     Data Type          Comments
54  * -----------     ---------          -----------
55  * version         uint8              0
56  * class           string             "asic"
57  * ENA             uint64             ENA Format 1
58  * detector        fmri               aggregated ID data for SC-DE
59  *
60  * Datapath ereport subclasses and data payloads:
61  * There will be two types of ereports (error and fault) which will be
62  * identified by the "type" member.
63  *
64  * ereport.asic.*.cds.cds-dp
65  * ereport.asic.*.dx.dx-dp
66  * ereport.asic.*.sdi.sdi-dp
67  * ereport.asic.*.cp.cp-dp
68  * ereport.asic.*.rp.rp-dp		// serengeti doesn't use "cp" term
69  *
70  * Member Name     Data Type          Comments
71  * -----------     ---------          -----------
72  * erptype         uint16            derived from message type: error or
73  *                                   fault
74  * t-value         uint32            SC's datapath SERD timeout threshold
75  * dp-list-sz      uint8             number of dp-list array elements
76  * dp-list         array of uint16   Safari IDs of affected cpus
77  */
78 
79 static char *dperrtype[] = {
80 	DP_ERROR_CDS,		/* Starcat types */
81 	DP_ERROR_DX,
82 	DP_ERROR_EX,
83 	DP_ERROR_CP,
84 	DP_ERROR_CDS,		/* Serengeti types */
85 	DP_ERROR_DX,
86 	DP_ERROR_RP
87 };
88 
89 /*
90  * Construct the ASRU(s)/FRU(s) associated with a data path fault,
91  * construct the fault(s), and add the suspect(s) to the case
92  *
93  */
94 void
95 cmd_dp_add_suspects(fmd_hdl_t *hdl, cmd_dp_t *dp)
96 {
97 	const char	*funcname = "cmd_dp_add_suspects()";
98 	char		class[DP_MAX_CLASS];
99 	char		frustr[3][DP_MAX_FRU];
100 	int		cpuid, numfru, sgpos, xcpos, i, err;
101 	nvlist_t	*asru, *fru = NULL, *flt, *hcel;
102 
103 	/* build ASRU, fault event class */
104 	asru = cmd_dp_setasru(hdl, dp);
105 	(void) snprintf(class, DP_MAX_CLASS, "fault.asic.%s.%s",
106 	    dperrtype[dp->dp_err], FM_ERROR_DATAPATH);
107 
108 	cpuid = dp->dp_cpuid_list[0];
109 
110 	/* extract fru position */
111 	sgpos = ((cpuid & 0x1f) / 4);
112 	xcpos = ((cpuid >> 5) & 0x1f);
113 
114 	/* build FRU(s) for the particular error */
115 	numfru = 0;
116 	switch (dp->dp_err) {
117 	case SC_DP_CDS_TYPE:
118 	case SC_DP_DX_TYPE:
119 		/* check for slot 1 (maxcat) */
120 		if ((cpuid >> 3) & 0x1)
121 			(void) snprintf(frustr[0], DP_MAX_FRU, "IO%d", xcpos);
122 		else
123 			(void) snprintf(frustr[0], DP_MAX_FRU, "SB%d", xcpos);
124 
125 		numfru = 1;
126 		break;
127 
128 	case SC_DP_EX_TYPE:
129 		/* check for slot 1 (maxcat) */
130 		if ((cpuid >> 3) & 0x1)
131 			(void) snprintf(frustr[0], DP_MAX_FRU, "IO%d", xcpos);
132 		else
133 			(void) snprintf(frustr[0], DP_MAX_FRU, "SB%d", xcpos);
134 
135 		(void) snprintf(frustr[1], DP_MAX_FRU, "EX%d", xcpos);
136 		numfru = 2;
137 		break;
138 
139 	case SC_DP_CP_TYPE:
140 		/* no way to know which CP half, be generic */
141 		(void) snprintf(frustr[0], DP_MAX_FRU, "EX%d", xcpos);
142 		(void) snprintf(frustr[1], DP_MAX_FRU, "CP");
143 		(void) snprintf(frustr[2], DP_MAX_FRU, "CS");
144 		numfru = 3;
145 		break;
146 
147 	case SG_DP_CDS_TYPE:
148 	case SG_DP_DX_TYPE:
149 		(void) snprintf(frustr[0], DP_MAX_FRU, "/N0/SB%d", sgpos);
150 		numfru = 1;
151 		break;
152 
153 	case SG_DP_RP_TYPE:
154 		/* no way to know which RP, be generic */
155 		(void) snprintf(frustr[0], DP_MAX_FRU, "/N0/SB%d", sgpos);
156 		(void) snprintf(frustr[1], DP_MAX_FRU, "RP");
157 		numfru = 2;
158 		break;
159 
160 	default:
161 		fmd_hdl_debug(hdl, "%s: invalid DP error type %d", funcname,
162 		    dp->dp_err);
163 		nvlist_free(asru);
164 		return;
165 	}
166 
167 	/* For each FRU, build an FMRI, create fault, add as suspect */
168 	for (i = 0; i < numfru; i++) {
169 		/* build a FRU FMRI */
170 		if (nvlist_alloc(&hcel, NV_UNIQUE_NAME, 0) != 0) {
171 			nvlist_free(asru);
172 			return;
173 		}
174 		err = nvlist_add_string(hcel, FM_FMRI_HC_NAME,
175 		    FM_FMRI_LEGACY_HC);
176 		err |= nvlist_add_string(hcel, FM_FMRI_HC_ID, frustr[i]);
177 		if (err != 0) {
178 			nvlist_free(hcel);
179 			nvlist_free(asru);
180 			return;
181 		}
182 
183 		/* put it in an HC scheme */
184 		if (nvlist_alloc(&fru, NV_UNIQUE_NAME, 0) != 0) {
185 			nvlist_free(hcel);
186 			nvlist_free(asru);
187 			return;
188 		}
189 		err = nvlist_add_uint8(fru, FM_VERSION, FM_HC_SCHEME_VERSION);
190 		err |= nvlist_add_string(fru, FM_FMRI_SCHEME,
191 		    FM_FMRI_SCHEME_HC);
192 		err |= nvlist_add_string(fru, FM_FMRI_HC_ROOT, "");
193 		err |= nvlist_add_uint32(fru, FM_FMRI_HC_LIST_SZ, 1);
194 		err |= nvlist_add_nvlist_array(fru, FM_FMRI_HC_LIST, &hcel, 1);
195 		if (err != 0) {
196 			nvlist_free(fru);
197 			nvlist_free(hcel);
198 			nvlist_free(asru);
199 			return;
200 		}
201 
202 		/* create the fault, add to case. */
203 		flt = cmd_nvl_create_fault(hdl, class, 100/numfru,
204 		    asru, fru, NULL);
205 		fmd_case_add_suspect(hdl, dp->dp_case, flt);
206 
207 		/* free up memory */
208 		nvlist_free(fru);
209 		nvlist_free(hcel);
210 	}
211 
212 	/* free up ASRU */
213 	nvlist_free(asru);
214 }
215 
216 /*ARGSUSED*/
217 cmd_evdisp_t
218 cmd_dp_common(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class,
219 	cmd_errcl_t clcode, uint8_t dperr)
220 {
221 	const char	*funcname = "cmd_dp_common()";
222 	const char	*uuidp;
223 	cmd_dp_t	*dpt, *ept;
224 	int 		err, i, fltflg;
225 	uint16_t	*cpuid_list;
226 	uint64_t	*serid_list;
227 	uint32_t	ncpuids;
228 
229 	/* extract common ereport contents */
230 	dpt = fmd_hdl_zalloc(hdl, sizeof (cmd_dp_t), FMD_SLEEP);
231 	dpt->dp_nodetype = CMD_NT_DP;
232 	dpt->dp_version = CMD_DP_VERSION;
233 	dpt->dp_err = dperr;
234 	err = nvlist_lookup_pairs(nvl, 0,
235 	    DP_EREPORT_TYPE, DATA_TYPE_UINT16, &dpt->dp_erpt_type,
236 	    DP_TVALUE, DATA_TYPE_UINT32, &dpt->dp_t_value,
237 	    DP_LIST_SIZE, DATA_TYPE_UINT32, &ncpuids, NULL);
238 	if (err != 0) {
239 		fmd_hdl_debug(hdl, "%s: unable to verify ereport contents "
240 		    "(erptype, ena, t_value, dp_list_sz)", funcname);
241 		fmd_hdl_free(hdl, dpt, sizeof (cmd_dp_t));
242 		return (CMD_EVD_UNUSED);
243 	}
244 
245 	/* extract cpuid list from ereport */
246 	err = nvlist_lookup_uint16_array(nvl, DP_LIST, &cpuid_list,
247 	    &ncpuids);
248 	err |= nvlist_lookup_uint64_array(nvl, SN_LIST, &serid_list,
249 	    &ncpuids);
250 	if (err != 0) {
251 		fmd_hdl_debug(hdl, "%s: unable to verify ereport contents "
252 		    "(dp_list, sn_list)", funcname);
253 		fmd_hdl_free(hdl, dpt, sizeof (cmd_dp_t));
254 		return (CMD_EVD_UNUSED);
255 	}
256 
257 	for (i = 0; i < ncpuids; i++) {
258 		dpt->dp_cpuid_list[i] = cpuid_list[i];
259 		dpt->dp_serid_list[i] = serid_list[i];
260 	}
261 
262 	dpt->dp_ncpus = ncpuids;
263 
264 	switch (dpt->dp_erpt_type) {
265 
266 	case DP_ERROR:
267 
268 		/*
269 		 * Scan existing faults on cmd.cmd_datapaths. If each
270 		 * cpuid in the current datapath event already has an
271 		 * associated DP fault, this is an uninteresting event.
272 		 */
273 		fltflg = 0;
274 		for (i = 0; i < ncpuids; i++)
275 			if (cmd_dp_lookup_fault(hdl, cpuid_list[i]) != NULL)
276 				fltflg++;
277 		if (fltflg == ncpuids) {
278 			fmd_hdl_debug(hdl, "%s: datapath fault(s) already "
279 			    "experienced, event uninteresting\n", funcname);
280 			fmd_hdl_free(hdl, dpt, sizeof (cmd_dp_t));
281 			return (CMD_EVD_UNUSED);
282 		}
283 
284 		/*
285 		 * Check for an existing datapath error, and if found
286 		 * add this event to the existing case
287 		 */
288 		ept = cmd_dp_lookup_error(dpt);
289 		if (ept != NULL && !fmd_case_closed(hdl, ept->dp_case)) {
290 			fmd_hdl_debug(hdl, "%s: found existing datapath error, "
291 			    "adding event to case\n", funcname);
292 			fmd_case_add_ereport(hdl, ept->dp_case, ep);
293 			/* check for t-value change */
294 			if (dpt->dp_t_value != ept->dp_t_value) {
295 				fmd_event_t *ep;
296 
297 				fmd_timer_remove(hdl, ept->dp_id);
298 				ep = fmd_case_getprincipal(hdl, ept->dp_case);
299 				ept->dp_id = fmd_timer_install(hdl,
300 				    (void *)CMD_TIMERTYPE_DP, ep,
301 				    (hrtime_t)NANOSEC *
302 				    (dpt->dp_t_value + 120));
303 			}
304 			fmd_hdl_free(hdl, dpt, sizeof (cmd_dp_t));
305 			return (CMD_EVD_OK);
306 		}
307 
308 		/*
309 		 * Didn't find an existing datapath error. Create a new
310 		 * case, add the event. Also, stash the datapath event on the
311 		 * cmd.cmd_datapaths list
312 		 */
313 		fmd_hdl_debug(hdl, "%s: new datapath error, create case and "
314 		    "add to cmd.cmd_datapaths\n", funcname);
315 		++cmd.cmd_dp_flag;
316 
317 		cmd_bufname(dpt->dp_bufname, sizeof (dpt->dp_bufname),
318 		    "dp_err_%d_%s", dpt->dp_cpuid_list[0],
319 		    dperrtype[dpt->dp_err]);
320 
321 		dp_buf_write(hdl, dpt);
322 
323 		dpt->dp_case = cmd_case_create(hdl, &dpt->dp_header,
324 		    CMD_PTR_DP_CASE, &uuidp);
325 		fmd_case_setprincipal(hdl, dpt->dp_case, ep);
326 		dpt->dp_id = fmd_timer_install(hdl, (void *)CMD_TIMERTYPE_DP,
327 		    ep, (hrtime_t)NANOSEC * (dpt->dp_t_value + 120));
328 		cmd_list_append(&cmd.cmd_datapaths, dpt);
329 		break;
330 
331 	case DP_FAULT:
332 		++cmd.cmd_dp_flag;
333 		dpt->dp_erpt_type = DP_FAULT;
334 		dpt->dp_id = 0;
335 
336 		cmd_bufname(dpt->dp_bufname, sizeof (dpt->dp_bufname),
337 		    "dp_flt_%d_%s", dpt->dp_cpuid_list[0],
338 		    dperrtype[dpt->dp_err]);
339 
340 		dp_buf_write(hdl, dpt);
341 
342 		/*
343 		 * Check for an existing DP_ERROR on cmd.cmd_datapaths, and
344 		 * if found, remove the DP_ERROR and close the case before
345 		 * creating the DP_FAULT case.
346 		 */
347 		ept = cmd_dp_lookup_error(dpt);
348 		if (ept != NULL && !fmd_case_closed(hdl, ept->dp_case)) {
349 			fmd_hdl_debug(hdl, "%s: existing datapath error "
350 			    "overtaken by datapath fault\n", funcname);
351 			fmd_timer_remove(hdl, ept->dp_id);
352 			cmd_dp_destroy(hdl, ept);
353 		}
354 
355 		dpt->dp_case = cmd_case_create(hdl, &dpt->dp_header,
356 		    CMD_PTR_DP_CASE, &uuidp);
357 		fmd_case_setprincipal(hdl, dpt->dp_case, ep);
358 
359 		/* Add suspect(s) and solve the case. */
360 		cmd_dp_add_suspects(hdl, dpt);
361 		fmd_case_solve(hdl, dpt->dp_case);
362 
363 		/* add it to cmd.cmd_datapaths */
364 		cmd_list_append(&cmd.cmd_datapaths, dpt);
365 
366 		--cmd.cmd_dp_flag;
367 		if (cmd.cmd_dp_flag == 0)
368 			cmd_dp_page_replay(hdl);
369 
370 		break;
371 
372 	default:
373 		fmd_hdl_debug(hdl, "%s: unknown ereport type", funcname);
374 		fmd_hdl_free(hdl, dpt, sizeof (cmd_dp_t));
375 		return (CMD_EVD_UNUSED);
376 	}
377 
378 	return (CMD_EVD_OK);
379 }
380 
381 cmd_evdisp_t
382 cmd_dp_cds(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class,
383     cmd_errcl_t clcode)
384 {
385 	if (fmd_nvl_class_match(hdl, nvl, "ereport.asic.starcat.*")) {
386 		return (cmd_dp_common(hdl, ep, nvl, class, clcode,
387 		    SC_DP_CDS_TYPE));
388 	} else
389 		return (cmd_dp_common(hdl, ep, nvl, class, clcode,
390 		    SG_DP_CDS_TYPE));
391 }
392 
393 cmd_evdisp_t
394 cmd_dp_dx(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class,
395     cmd_errcl_t clcode)
396 {
397 	if (fmd_nvl_class_match(hdl, nvl, "ereport.asic.starcat.*")) {
398 		return (cmd_dp_common(hdl, ep, nvl, class, clcode,
399 		    SC_DP_DX_TYPE));
400 
401 	} else
402 		return (cmd_dp_common(hdl, ep, nvl, class, clcode,
403 		    SG_DP_DX_TYPE));
404 }
405 
406 cmd_evdisp_t
407 cmd_dp_ex(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class,
408     cmd_errcl_t clcode)
409 {
410 	return (cmd_dp_common(hdl, ep, nvl, class, clcode,
411 	    SC_DP_EX_TYPE));
412 }
413 
414 cmd_evdisp_t
415 cmd_dp_cp(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class,
416     cmd_errcl_t clcode)
417 {
418 	if (fmd_nvl_class_match(hdl, nvl, "ereport.asic.starcat.*")) {
419 		return (cmd_dp_common(hdl, ep, nvl, class, clcode,
420 		    SC_DP_CP_TYPE));
421 	} else
422 		return (cmd_dp_common(hdl, ep, nvl, class, clcode,
423 		    SG_DP_RP_TYPE));
424 }
425