1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #pragma ident "%Z%%M% %I% %E% SMI" 27 28 /* 29 * Ereport-handling routines for Datapath errors 30 * - receive datapath ereports and open datapath case 31 * - solve datapath case when datapath fault ereports are received 32 * - maintain state of datapath error flag 33 * - close datapath case when timeout occurs (w/o fault) 34 */ 35 36 37 #include <strings.h> 38 #include <string.h> 39 #include <errno.h> 40 #include <fm/fmd_api.h> 41 #include <sys/fm/protocol.h> 42 #include <sys/async.h> 43 #include <sys/time.h> 44 #include <cmd.h> 45 #include <cmd_state.h> 46 #include <cmd_dp.h> 47 #include <cmd_dp_page.h> 48 #include <cmd_page.h> 49 #include <libnvpair.h> 50 #include <sys/plat_datapath.h> 51 52 /* 53 * Member Name Data Type Comments 54 * ----------- --------- ----------- 55 * version uint8 0 56 * class string "asic" 57 * ENA uint64 ENA Format 1 58 * detector fmri aggregated ID data for SC-DE 59 * 60 * Datapath ereport subclasses and data payloads: 61 * There will be two types of ereports (error and fault) which will be 62 * identified by the "type" member. 63 * 64 * ereport.asic.*.cds.cds-dp 65 * ereport.asic.*.dx.dx-dp 66 * ereport.asic.*.sdi.sdi-dp 67 * ereport.asic.*.cp.cp-dp 68 * ereport.asic.*.rp.rp-dp // serengeti doesn't use "cp" term 69 * 70 * Member Name Data Type Comments 71 * ----------- --------- ----------- 72 * erptype uint16 derived from message type: error or 73 * fault 74 * t-value uint32 SC's datapath SERD timeout threshold 75 * dp-list-sz uint8 number of dp-list array elements 76 * dp-list array of uint16 Safari IDs of affected cpus 77 */ 78 79 static char *dperrtype[] = { 80 DP_ERROR_CDS, /* Starcat types */ 81 DP_ERROR_DX, 82 DP_ERROR_EX, 83 DP_ERROR_CP, 84 DP_ERROR_CDS, /* Serengeti types */ 85 DP_ERROR_DX, 86 DP_ERROR_RP 87 }; 88 89 /* 90 * Construct the ASRU(s)/FRU(s) associated with a data path fault, 91 * construct the fault(s), and add the suspect(s) to the case 92 * 93 */ 94 void 95 cmd_dp_add_suspects(fmd_hdl_t *hdl, cmd_dp_t *dp) 96 { 97 const char *funcname = "cmd_dp_add_suspects()"; 98 char class[DP_MAX_CLASS]; 99 char frustr[3][DP_MAX_FRU]; 100 int cpuid, numfru, sgpos, xcpos, i, err; 101 nvlist_t *asru, *fru = NULL, *flt, *hcel; 102 103 /* build ASRU, fault event class */ 104 asru = cmd_dp_setasru(hdl, dp); 105 (void) snprintf(class, DP_MAX_CLASS, "fault.asic.%s.%s", 106 dperrtype[dp->dp_err], FM_ERROR_DATAPATH); 107 108 cpuid = dp->dp_cpuid_list[0]; 109 110 /* extract fru position */ 111 sgpos = ((cpuid & 0x1f) / 4); 112 xcpos = ((cpuid >> 5) & 0x1f); 113 114 /* build FRU(s) for the particular error */ 115 numfru = 0; 116 switch (dp->dp_err) { 117 case SC_DP_CDS_TYPE: 118 case SC_DP_DX_TYPE: 119 /* check for slot 1 (maxcat) */ 120 if ((cpuid >> 3) & 0x1) 121 (void) snprintf(frustr[0], DP_MAX_FRU, "IO%d", xcpos); 122 else 123 (void) snprintf(frustr[0], DP_MAX_FRU, "SB%d", xcpos); 124 125 numfru = 1; 126 break; 127 128 case SC_DP_EX_TYPE: 129 /* check for slot 1 (maxcat) */ 130 if ((cpuid >> 3) & 0x1) 131 (void) snprintf(frustr[0], DP_MAX_FRU, "IO%d", xcpos); 132 else 133 (void) snprintf(frustr[0], DP_MAX_FRU, "SB%d", xcpos); 134 135 (void) snprintf(frustr[1], DP_MAX_FRU, "EX%d", xcpos); 136 numfru = 2; 137 break; 138 139 case SC_DP_CP_TYPE: 140 /* no way to know which CP half, be generic */ 141 (void) snprintf(frustr[0], DP_MAX_FRU, "EX%d", xcpos); 142 (void) snprintf(frustr[1], DP_MAX_FRU, "CP"); 143 (void) snprintf(frustr[2], DP_MAX_FRU, "CS"); 144 numfru = 3; 145 break; 146 147 case SG_DP_CDS_TYPE: 148 case SG_DP_DX_TYPE: 149 (void) snprintf(frustr[0], DP_MAX_FRU, "/N0/SB%d", sgpos); 150 numfru = 1; 151 break; 152 153 case SG_DP_RP_TYPE: 154 /* no way to know which RP, be generic */ 155 (void) snprintf(frustr[0], DP_MAX_FRU, "/N0/SB%d", sgpos); 156 (void) snprintf(frustr[1], DP_MAX_FRU, "RP"); 157 numfru = 2; 158 break; 159 160 default: 161 fmd_hdl_debug(hdl, "%s: invalid DP error type %d", funcname, 162 dp->dp_err); 163 nvlist_free(asru); 164 return; 165 } 166 167 /* For each FRU, build an FMRI, create fault, add as suspect */ 168 for (i = 0; i < numfru; i++) { 169 /* build a FRU FMRI */ 170 if (nvlist_alloc(&hcel, NV_UNIQUE_NAME, 0) != 0) { 171 nvlist_free(asru); 172 return; 173 } 174 err = nvlist_add_string(hcel, FM_FMRI_HC_NAME, 175 FM_FMRI_LEGACY_HC); 176 err |= nvlist_add_string(hcel, FM_FMRI_HC_ID, frustr[i]); 177 if (err != 0) { 178 nvlist_free(hcel); 179 nvlist_free(asru); 180 return; 181 } 182 183 /* put it in an HC scheme */ 184 if (nvlist_alloc(&fru, NV_UNIQUE_NAME, 0) != 0) { 185 nvlist_free(hcel); 186 nvlist_free(asru); 187 return; 188 } 189 err = nvlist_add_uint8(fru, FM_VERSION, FM_HC_SCHEME_VERSION); 190 err |= nvlist_add_string(fru, FM_FMRI_SCHEME, 191 FM_FMRI_SCHEME_HC); 192 err |= nvlist_add_string(fru, FM_FMRI_HC_ROOT, ""); 193 err |= nvlist_add_uint32(fru, FM_FMRI_HC_LIST_SZ, 1); 194 err |= nvlist_add_nvlist_array(fru, FM_FMRI_HC_LIST, &hcel, 1); 195 if (err != 0) { 196 nvlist_free(fru); 197 nvlist_free(hcel); 198 nvlist_free(asru); 199 return; 200 } 201 202 /* create the fault, add to case. */ 203 flt = cmd_nvl_create_fault(hdl, class, 100/numfru, 204 asru, fru, NULL); 205 fmd_case_add_suspect(hdl, dp->dp_case, flt); 206 207 /* free up memory */ 208 nvlist_free(fru); 209 nvlist_free(hcel); 210 } 211 212 /* free up ASRU */ 213 nvlist_free(asru); 214 } 215 216 /*ARGSUSED*/ 217 cmd_evdisp_t 218 cmd_dp_common(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class, 219 cmd_errcl_t clcode, uint8_t dperr) 220 { 221 const char *funcname = "cmd_dp_common()"; 222 const char *uuidp; 223 cmd_dp_t *dpt, *ept; 224 int err, i, fltflg; 225 uint16_t *cpuid_list; 226 uint64_t *serid_list; 227 uint32_t ncpuids; 228 229 /* extract common ereport contents */ 230 dpt = fmd_hdl_zalloc(hdl, sizeof (cmd_dp_t), FMD_SLEEP); 231 dpt->dp_nodetype = CMD_NT_DP; 232 dpt->dp_version = CMD_DP_VERSION; 233 dpt->dp_err = dperr; 234 err = nvlist_lookup_pairs(nvl, 0, 235 DP_EREPORT_TYPE, DATA_TYPE_UINT16, &dpt->dp_erpt_type, 236 DP_TVALUE, DATA_TYPE_UINT32, &dpt->dp_t_value, 237 DP_LIST_SIZE, DATA_TYPE_UINT32, &ncpuids, NULL); 238 if (err != 0) { 239 fmd_hdl_debug(hdl, "%s: unable to verify ereport contents " 240 "(erptype, ena, t_value, dp_list_sz)", funcname); 241 fmd_hdl_free(hdl, dpt, sizeof (cmd_dp_t)); 242 return (CMD_EVD_UNUSED); 243 } 244 245 /* extract cpuid list from ereport */ 246 err = nvlist_lookup_uint16_array(nvl, DP_LIST, &cpuid_list, 247 &ncpuids); 248 err |= nvlist_lookup_uint64_array(nvl, SN_LIST, &serid_list, 249 &ncpuids); 250 if (err != 0) { 251 fmd_hdl_debug(hdl, "%s: unable to verify ereport contents " 252 "(dp_list, sn_list)", funcname); 253 fmd_hdl_free(hdl, dpt, sizeof (cmd_dp_t)); 254 return (CMD_EVD_UNUSED); 255 } 256 257 for (i = 0; i < ncpuids; i++) { 258 dpt->dp_cpuid_list[i] = cpuid_list[i]; 259 dpt->dp_serid_list[i] = serid_list[i]; 260 } 261 262 dpt->dp_ncpus = ncpuids; 263 264 switch (dpt->dp_erpt_type) { 265 266 case DP_ERROR: 267 268 /* 269 * Scan existing faults on cmd.cmd_datapaths. If each 270 * cpuid in the current datapath event already has an 271 * associated DP fault, this is an uninteresting event. 272 */ 273 fltflg = 0; 274 for (i = 0; i < ncpuids; i++) 275 if (cmd_dp_lookup_fault(hdl, cpuid_list[i]) != NULL) 276 fltflg++; 277 if (fltflg == ncpuids) { 278 fmd_hdl_debug(hdl, "%s: datapath fault(s) already " 279 "experienced, event uninteresting\n", funcname); 280 fmd_hdl_free(hdl, dpt, sizeof (cmd_dp_t)); 281 return (CMD_EVD_UNUSED); 282 } 283 284 /* 285 * Check for an existing datapath error, and if found 286 * add this event to the existing case 287 */ 288 ept = cmd_dp_lookup_error(dpt); 289 if (ept != NULL && !fmd_case_closed(hdl, ept->dp_case)) { 290 fmd_hdl_debug(hdl, "%s: found existing datapath error, " 291 "adding event to case\n", funcname); 292 fmd_case_add_ereport(hdl, ept->dp_case, ep); 293 /* check for t-value change */ 294 if (dpt->dp_t_value != ept->dp_t_value) { 295 fmd_event_t *ep; 296 297 fmd_timer_remove(hdl, ept->dp_id); 298 ep = fmd_case_getprincipal(hdl, ept->dp_case); 299 ept->dp_id = fmd_timer_install(hdl, 300 (void *)CMD_TIMERTYPE_DP, ep, 301 (hrtime_t)NANOSEC * 302 (dpt->dp_t_value + 120)); 303 } 304 fmd_hdl_free(hdl, dpt, sizeof (cmd_dp_t)); 305 return (CMD_EVD_OK); 306 } 307 308 /* 309 * Didn't find an existing datapath error. Create a new 310 * case, add the event. Also, stash the datapath event on the 311 * cmd.cmd_datapaths list 312 */ 313 fmd_hdl_debug(hdl, "%s: new datapath error, create case and " 314 "add to cmd.cmd_datapaths\n", funcname); 315 ++cmd.cmd_dp_flag; 316 317 cmd_bufname(dpt->dp_bufname, sizeof (dpt->dp_bufname), 318 "dp_err_%d_%s", dpt->dp_cpuid_list[0], 319 dperrtype[dpt->dp_err]); 320 321 dp_buf_write(hdl, dpt); 322 323 dpt->dp_case = cmd_case_create(hdl, &dpt->dp_header, 324 CMD_PTR_DP_CASE, &uuidp); 325 fmd_case_setprincipal(hdl, dpt->dp_case, ep); 326 dpt->dp_id = fmd_timer_install(hdl, (void *)CMD_TIMERTYPE_DP, 327 ep, (hrtime_t)NANOSEC * (dpt->dp_t_value + 120)); 328 cmd_list_append(&cmd.cmd_datapaths, dpt); 329 break; 330 331 case DP_FAULT: 332 ++cmd.cmd_dp_flag; 333 dpt->dp_erpt_type = DP_FAULT; 334 dpt->dp_id = 0; 335 336 cmd_bufname(dpt->dp_bufname, sizeof (dpt->dp_bufname), 337 "dp_flt_%d_%s", dpt->dp_cpuid_list[0], 338 dperrtype[dpt->dp_err]); 339 340 dp_buf_write(hdl, dpt); 341 342 /* 343 * Check for an existing DP_ERROR on cmd.cmd_datapaths, and 344 * if found, remove the DP_ERROR and close the case before 345 * creating the DP_FAULT case. 346 */ 347 ept = cmd_dp_lookup_error(dpt); 348 if (ept != NULL && !fmd_case_closed(hdl, ept->dp_case)) { 349 fmd_hdl_debug(hdl, "%s: existing datapath error " 350 "overtaken by datapath fault\n", funcname); 351 fmd_timer_remove(hdl, ept->dp_id); 352 cmd_dp_destroy(hdl, ept); 353 } 354 355 dpt->dp_case = cmd_case_create(hdl, &dpt->dp_header, 356 CMD_PTR_DP_CASE, &uuidp); 357 fmd_case_setprincipal(hdl, dpt->dp_case, ep); 358 359 /* Add suspect(s) and solve the case. */ 360 cmd_dp_add_suspects(hdl, dpt); 361 fmd_case_solve(hdl, dpt->dp_case); 362 363 /* add it to cmd.cmd_datapaths */ 364 cmd_list_append(&cmd.cmd_datapaths, dpt); 365 366 --cmd.cmd_dp_flag; 367 if (cmd.cmd_dp_flag == 0) 368 cmd_dp_page_replay(hdl); 369 370 break; 371 372 default: 373 fmd_hdl_debug(hdl, "%s: unknown ereport type", funcname); 374 fmd_hdl_free(hdl, dpt, sizeof (cmd_dp_t)); 375 return (CMD_EVD_UNUSED); 376 } 377 378 return (CMD_EVD_OK); 379 } 380 381 cmd_evdisp_t 382 cmd_dp_cds(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class, 383 cmd_errcl_t clcode) 384 { 385 if (fmd_nvl_class_match(hdl, nvl, "ereport.asic.starcat.*")) { 386 return (cmd_dp_common(hdl, ep, nvl, class, clcode, 387 SC_DP_CDS_TYPE)); 388 } else 389 return (cmd_dp_common(hdl, ep, nvl, class, clcode, 390 SG_DP_CDS_TYPE)); 391 } 392 393 cmd_evdisp_t 394 cmd_dp_dx(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class, 395 cmd_errcl_t clcode) 396 { 397 if (fmd_nvl_class_match(hdl, nvl, "ereport.asic.starcat.*")) { 398 return (cmd_dp_common(hdl, ep, nvl, class, clcode, 399 SC_DP_DX_TYPE)); 400 401 } else 402 return (cmd_dp_common(hdl, ep, nvl, class, clcode, 403 SG_DP_DX_TYPE)); 404 } 405 406 cmd_evdisp_t 407 cmd_dp_ex(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class, 408 cmd_errcl_t clcode) 409 { 410 return (cmd_dp_common(hdl, ep, nvl, class, clcode, 411 SC_DP_EX_TYPE)); 412 } 413 414 cmd_evdisp_t 415 cmd_dp_cp(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class, 416 cmd_errcl_t clcode) 417 { 418 if (fmd_nvl_class_match(hdl, nvl, "ereport.asic.starcat.*")) { 419 return (cmd_dp_common(hdl, ep, nvl, class, clcode, 420 SC_DP_CP_TYPE)); 421 } else 422 return (cmd_dp_common(hdl, ep, nvl, class, clcode, 423 SG_DP_RP_TYPE)); 424 } 425