1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
24 */
25
26 #pragma ident "%Z%%M% %I% %E% SMI"
27
28 /*
29 * Ereport-handling routines for Datapath errors
30 * - receive datapath ereports and open datapath case
31 * - solve datapath case when datapath fault ereports are received
32 * - maintain state of datapath error flag
33 * - close datapath case when timeout occurs (w/o fault)
34 */
35
36
37 #include <strings.h>
38 #include <string.h>
39 #include <errno.h>
40 #include <fm/fmd_api.h>
41 #include <sys/fm/protocol.h>
42 #include <sys/async.h>
43 #include <sys/time.h>
44 #include <cmd.h>
45 #include <cmd_state.h>
46 #include <cmd_dp.h>
47 #include <cmd_dp_page.h>
48 #include <cmd_page.h>
49 #include <libnvpair.h>
50 #include <sys/plat_datapath.h>
51
52 /*
53 * Member Name Data Type Comments
54 * ----------- --------- -----------
55 * version uint8 0
56 * class string "asic"
57 * ENA uint64 ENA Format 1
58 * detector fmri aggregated ID data for SC-DE
59 *
60 * Datapath ereport subclasses and data payloads:
61 * There will be two types of ereports (error and fault) which will be
62 * identified by the "type" member.
63 *
64 * ereport.asic.*.cds.cds-dp
65 * ereport.asic.*.dx.dx-dp
66 * ereport.asic.*.sdi.sdi-dp
67 * ereport.asic.*.cp.cp-dp
68 * ereport.asic.*.rp.rp-dp // serengeti doesn't use "cp" term
69 *
70 * Member Name Data Type Comments
71 * ----------- --------- -----------
72 * erptype uint16 derived from message type: error or
73 * fault
74 * t-value uint32 SC's datapath SERD timeout threshold
75 * dp-list-sz uint8 number of dp-list array elements
76 * dp-list array of uint16 Safari IDs of affected cpus
77 */
78
/*
 * Datapath error-type names, indexed by a cmd_dp_t's dp_err value.  The
 * selected name is spliced into fault class strings and into the buffer
 * names built for datapath errors and faults.
 *
 * NOTE(review): the entry order presumably mirrors the numeric values of the
 * SC_DP_* and SG_DP_* type constants -- confirm against
 * <sys/plat_datapath.h> before adding or reordering entries.
 */
static char *dperrtype[] = {
	DP_ERROR_CDS,		/* Starcat types */
	DP_ERROR_DX,
	DP_ERROR_EX,
	DP_ERROR_CP,
	DP_ERROR_CDS,		/* Serengeti types */
	DP_ERROR_DX,
	DP_ERROR_RP
};
88
89 /*
90 * Construct the ASRU(s)/FRU(s) associated with a data path fault,
91 * construct the fault(s), and add the suspect(s) to the case
92 *
93 */
94 void
cmd_dp_add_suspects(fmd_hdl_t * hdl,cmd_dp_t * dp)95 cmd_dp_add_suspects(fmd_hdl_t *hdl, cmd_dp_t *dp)
96 {
97 const char *funcname = "cmd_dp_add_suspects()";
98 char class[DP_MAX_CLASS];
99 char frustr[3][DP_MAX_FRU];
100 int cpuid, numfru, sgpos, xcpos, i, err;
101 nvlist_t *asru, *fru = NULL, *flt, *hcel;
102
103 /* build ASRU, fault event class */
104 asru = cmd_dp_setasru(hdl, dp);
105 (void) snprintf(class, DP_MAX_CLASS, "fault.asic.%s.%s",
106 dperrtype[dp->dp_err], FM_ERROR_DATAPATH);
107
108 cpuid = dp->dp_cpuid_list[0];
109
110 /* extract fru position */
111 sgpos = ((cpuid & 0x1f) / 4);
112 xcpos = ((cpuid >> 5) & 0x1f);
113
114 /* build FRU(s) for the particular error */
115 numfru = 0;
116 switch (dp->dp_err) {
117 case SC_DP_CDS_TYPE:
118 case SC_DP_DX_TYPE:
119 /* check for slot 1 (maxcat) */
120 if ((cpuid >> 3) & 0x1)
121 (void) snprintf(frustr[0], DP_MAX_FRU, "IO%d", xcpos);
122 else
123 (void) snprintf(frustr[0], DP_MAX_FRU, "SB%d", xcpos);
124
125 numfru = 1;
126 break;
127
128 case SC_DP_EX_TYPE:
129 /* check for slot 1 (maxcat) */
130 if ((cpuid >> 3) & 0x1)
131 (void) snprintf(frustr[0], DP_MAX_FRU, "IO%d", xcpos);
132 else
133 (void) snprintf(frustr[0], DP_MAX_FRU, "SB%d", xcpos);
134
135 (void) snprintf(frustr[1], DP_MAX_FRU, "EX%d", xcpos);
136 numfru = 2;
137 break;
138
139 case SC_DP_CP_TYPE:
140 /* no way to know which CP half, be generic */
141 (void) snprintf(frustr[0], DP_MAX_FRU, "EX%d", xcpos);
142 (void) snprintf(frustr[1], DP_MAX_FRU, "CP");
143 (void) snprintf(frustr[2], DP_MAX_FRU, "CS");
144 numfru = 3;
145 break;
146
147 case SG_DP_CDS_TYPE:
148 case SG_DP_DX_TYPE:
149 (void) snprintf(frustr[0], DP_MAX_FRU, "/N0/SB%d", sgpos);
150 numfru = 1;
151 break;
152
153 case SG_DP_RP_TYPE:
154 /* no way to know which RP, be generic */
155 (void) snprintf(frustr[0], DP_MAX_FRU, "/N0/SB%d", sgpos);
156 (void) snprintf(frustr[1], DP_MAX_FRU, "RP");
157 numfru = 2;
158 break;
159
160 default:
161 fmd_hdl_debug(hdl, "%s: invalid DP error type %d", funcname,
162 dp->dp_err);
163 nvlist_free(asru);
164 return;
165 }
166
167 /* For each FRU, build an FMRI, create fault, add as suspect */
168 for (i = 0; i < numfru; i++) {
169 /* build a FRU FMRI */
170 if (nvlist_alloc(&hcel, NV_UNIQUE_NAME, 0) != 0) {
171 nvlist_free(asru);
172 return;
173 }
174 err = nvlist_add_string(hcel, FM_FMRI_HC_NAME,
175 FM_FMRI_LEGACY_HC);
176 err |= nvlist_add_string(hcel, FM_FMRI_HC_ID, frustr[i]);
177 if (err != 0) {
178 nvlist_free(hcel);
179 nvlist_free(asru);
180 return;
181 }
182
183 /* put it in an HC scheme */
184 if (nvlist_alloc(&fru, NV_UNIQUE_NAME, 0) != 0) {
185 nvlist_free(hcel);
186 nvlist_free(asru);
187 return;
188 }
189 err = nvlist_add_uint8(fru, FM_VERSION, FM_HC_SCHEME_VERSION);
190 err |= nvlist_add_string(fru, FM_FMRI_SCHEME,
191 FM_FMRI_SCHEME_HC);
192 err |= nvlist_add_string(fru, FM_FMRI_HC_ROOT, "");
193 err |= nvlist_add_uint32(fru, FM_FMRI_HC_LIST_SZ, 1);
194 err |= nvlist_add_nvlist_array(fru, FM_FMRI_HC_LIST, &hcel, 1);
195 if (err != 0) {
196 nvlist_free(fru);
197 nvlist_free(hcel);
198 nvlist_free(asru);
199 return;
200 }
201
202 /* create the fault, add to case. */
203 flt = cmd_nvl_create_fault(hdl, class, 100/numfru,
204 asru, fru, NULL);
205 fmd_case_add_suspect(hdl, dp->dp_case, flt);
206
207 /* free up memory */
208 nvlist_free(fru);
209 nvlist_free(hcel);
210 }
211
212 /* free up ASRU */
213 nvlist_free(asru);
214 }
215
216 /*ARGSUSED*/
217 cmd_evdisp_t
cmd_dp_common(fmd_hdl_t * hdl,fmd_event_t * ep,nvlist_t * nvl,const char * class,cmd_errcl_t clcode,uint8_t dperr)218 cmd_dp_common(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class,
219 cmd_errcl_t clcode, uint8_t dperr)
220 {
221 const char *funcname = "cmd_dp_common()";
222 const char *uuidp;
223 cmd_dp_t *dpt, *ept;
224 int err, i, fltflg;
225 uint16_t *cpuid_list;
226 uint64_t *serid_list;
227 uint32_t ncpuids;
228
229 /* extract common ereport contents */
230 dpt = fmd_hdl_zalloc(hdl, sizeof (cmd_dp_t), FMD_SLEEP);
231 dpt->dp_nodetype = CMD_NT_DP;
232 dpt->dp_version = CMD_DP_VERSION;
233 dpt->dp_err = dperr;
234 err = nvlist_lookup_pairs(nvl, 0,
235 DP_EREPORT_TYPE, DATA_TYPE_UINT16, &dpt->dp_erpt_type,
236 DP_TVALUE, DATA_TYPE_UINT32, &dpt->dp_t_value,
237 DP_LIST_SIZE, DATA_TYPE_UINT32, &ncpuids, NULL);
238 if (err != 0) {
239 fmd_hdl_debug(hdl, "%s: unable to verify ereport contents "
240 "(erptype, ena, t_value, dp_list_sz)", funcname);
241 fmd_hdl_free(hdl, dpt, sizeof (cmd_dp_t));
242 return (CMD_EVD_UNUSED);
243 }
244
245 /* extract cpuid list from ereport */
246 err = nvlist_lookup_uint16_array(nvl, DP_LIST, &cpuid_list,
247 &ncpuids);
248 err |= nvlist_lookup_uint64_array(nvl, SN_LIST, &serid_list,
249 &ncpuids);
250 if (err != 0) {
251 fmd_hdl_debug(hdl, "%s: unable to verify ereport contents "
252 "(dp_list, sn_list)", funcname);
253 fmd_hdl_free(hdl, dpt, sizeof (cmd_dp_t));
254 return (CMD_EVD_UNUSED);
255 }
256
257 for (i = 0; i < ncpuids; i++) {
258 dpt->dp_cpuid_list[i] = cpuid_list[i];
259 dpt->dp_serid_list[i] = serid_list[i];
260 }
261
262 dpt->dp_ncpus = ncpuids;
263
264 switch (dpt->dp_erpt_type) {
265
266 case DP_ERROR:
267
268 /*
269 * Scan existing faults on cmd.cmd_datapaths. If each
270 * cpuid in the current datapath event already has an
271 * associated DP fault, this is an uninteresting event.
272 */
273 fltflg = 0;
274 for (i = 0; i < ncpuids; i++)
275 if (cmd_dp_lookup_fault(hdl, cpuid_list[i]) != NULL)
276 fltflg++;
277 if (fltflg == ncpuids) {
278 fmd_hdl_debug(hdl, "%s: datapath fault(s) already "
279 "experienced, event uninteresting\n", funcname);
280 fmd_hdl_free(hdl, dpt, sizeof (cmd_dp_t));
281 return (CMD_EVD_UNUSED);
282 }
283
284 /*
285 * Check for an existing datapath error, and if found
286 * add this event to the existing case
287 */
288 ept = cmd_dp_lookup_error(dpt);
289 if (ept != NULL && !fmd_case_closed(hdl, ept->dp_case)) {
290 fmd_hdl_debug(hdl, "%s: found existing datapath error, "
291 "adding event to case\n", funcname);
292 fmd_case_add_ereport(hdl, ept->dp_case, ep);
293 /* check for t-value change */
294 if (dpt->dp_t_value != ept->dp_t_value) {
295 fmd_event_t *ep;
296
297 fmd_timer_remove(hdl, ept->dp_id);
298 ep = fmd_case_getprincipal(hdl, ept->dp_case);
299 ept->dp_id = fmd_timer_install(hdl,
300 (void *)CMD_TIMERTYPE_DP, ep,
301 (hrtime_t)NANOSEC *
302 (dpt->dp_t_value + 120));
303 }
304 fmd_hdl_free(hdl, dpt, sizeof (cmd_dp_t));
305 return (CMD_EVD_OK);
306 }
307
308 /*
309 * Didn't find an existing datapath error. Create a new
310 * case, add the event. Also, stash the datapath event on the
311 * cmd.cmd_datapaths list
312 */
313 fmd_hdl_debug(hdl, "%s: new datapath error, create case and "
314 "add to cmd.cmd_datapaths\n", funcname);
315 ++cmd.cmd_dp_flag;
316
317 cmd_bufname(dpt->dp_bufname, sizeof (dpt->dp_bufname),
318 "dp_err_%d_%s", dpt->dp_cpuid_list[0],
319 dperrtype[dpt->dp_err]);
320
321 dp_buf_write(hdl, dpt);
322
323 dpt->dp_case = cmd_case_create(hdl, &dpt->dp_header,
324 CMD_PTR_DP_CASE, &uuidp);
325 fmd_case_setprincipal(hdl, dpt->dp_case, ep);
326 dpt->dp_id = fmd_timer_install(hdl, (void *)CMD_TIMERTYPE_DP,
327 ep, (hrtime_t)NANOSEC * (dpt->dp_t_value + 120));
328 cmd_list_append(&cmd.cmd_datapaths, dpt);
329 break;
330
331 case DP_FAULT:
332 ++cmd.cmd_dp_flag;
333 dpt->dp_erpt_type = DP_FAULT;
334 dpt->dp_id = 0;
335
336 cmd_bufname(dpt->dp_bufname, sizeof (dpt->dp_bufname),
337 "dp_flt_%d_%s", dpt->dp_cpuid_list[0],
338 dperrtype[dpt->dp_err]);
339
340 dp_buf_write(hdl, dpt);
341
342 /*
343 * Check for an existing DP_ERROR on cmd.cmd_datapaths, and
344 * if found, remove the DP_ERROR and close the case before
345 * creating the DP_FAULT case.
346 */
347 ept = cmd_dp_lookup_error(dpt);
348 if (ept != NULL && !fmd_case_closed(hdl, ept->dp_case)) {
349 fmd_hdl_debug(hdl, "%s: existing datapath error "
350 "overtaken by datapath fault\n", funcname);
351 fmd_timer_remove(hdl, ept->dp_id);
352 cmd_dp_destroy(hdl, ept);
353 }
354
355 dpt->dp_case = cmd_case_create(hdl, &dpt->dp_header,
356 CMD_PTR_DP_CASE, &uuidp);
357 fmd_case_setprincipal(hdl, dpt->dp_case, ep);
358
359 /* Add suspect(s) and solve the case. */
360 cmd_dp_add_suspects(hdl, dpt);
361 fmd_case_solve(hdl, dpt->dp_case);
362
363 /* add it to cmd.cmd_datapaths */
364 cmd_list_append(&cmd.cmd_datapaths, dpt);
365
366 --cmd.cmd_dp_flag;
367 if (cmd.cmd_dp_flag == 0)
368 cmd_dp_page_replay(hdl);
369
370 break;
371
372 default:
373 fmd_hdl_debug(hdl, "%s: unknown ereport type", funcname);
374 fmd_hdl_free(hdl, dpt, sizeof (cmd_dp_t));
375 return (CMD_EVD_UNUSED);
376 }
377
378 return (CMD_EVD_OK);
379 }
380
381 cmd_evdisp_t
cmd_dp_cds(fmd_hdl_t * hdl,fmd_event_t * ep,nvlist_t * nvl,const char * class,cmd_errcl_t clcode)382 cmd_dp_cds(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class,
383 cmd_errcl_t clcode)
384 {
385 if (fmd_nvl_class_match(hdl, nvl, "ereport.asic.starcat.*")) {
386 return (cmd_dp_common(hdl, ep, nvl, class, clcode,
387 SC_DP_CDS_TYPE));
388 } else
389 return (cmd_dp_common(hdl, ep, nvl, class, clcode,
390 SG_DP_CDS_TYPE));
391 }
392
393 cmd_evdisp_t
cmd_dp_dx(fmd_hdl_t * hdl,fmd_event_t * ep,nvlist_t * nvl,const char * class,cmd_errcl_t clcode)394 cmd_dp_dx(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class,
395 cmd_errcl_t clcode)
396 {
397 if (fmd_nvl_class_match(hdl, nvl, "ereport.asic.starcat.*")) {
398 return (cmd_dp_common(hdl, ep, nvl, class, clcode,
399 SC_DP_DX_TYPE));
400
401 } else
402 return (cmd_dp_common(hdl, ep, nvl, class, clcode,
403 SG_DP_DX_TYPE));
404 }
405
406 cmd_evdisp_t
cmd_dp_ex(fmd_hdl_t * hdl,fmd_event_t * ep,nvlist_t * nvl,const char * class,cmd_errcl_t clcode)407 cmd_dp_ex(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class,
408 cmd_errcl_t clcode)
409 {
410 return (cmd_dp_common(hdl, ep, nvl, class, clcode,
411 SC_DP_EX_TYPE));
412 }
413
414 cmd_evdisp_t
cmd_dp_cp(fmd_hdl_t * hdl,fmd_event_t * ep,nvlist_t * nvl,const char * class,cmd_errcl_t clcode)415 cmd_dp_cp(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class,
416 cmd_errcl_t clcode)
417 {
418 if (fmd_nvl_class_match(hdl, nvl, "ereport.asic.starcat.*")) {
419 return (cmd_dp_common(hdl, ep, nvl, class, clcode,
420 SC_DP_CP_TYPE));
421 } else
422 return (cmd_dp_common(hdl, ep, nvl, class, clcode,
423 SG_DP_RP_TYPE));
424 }
425