/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

/*
 * OPL platform specific functions for
 * CPU/Memory error diagnosis engine.
 */
#include <cmd.h>
#include <cmd_dimm.h>
#include <cmd_bank.h>
#include <cmd_page.h>
#include <cmd_opl.h>
#include <string.h>
#include <errno.h>
#include <fcntl.h>
#include <unistd.h>
#include <dirent.h>
#include <sys/stat.h>

#include <sys/fm/protocol.h>
#include <sys/fm/io/opl_mc_fm.h>
#include <sys/async.h>
#include <sys/opl_olympus_regs.h>
#include <sys/fm/cpu/SPARC64-VI.h>
#include <sys/int_const.h>
#include <sys/mutex.h>
#include <sys/dditypes.h>
#include <opl/sys/mc-opl.h>

/*
 * The following is the common function for handling
 * memory UE with EID=MEM.
 * The error could be detected by either CPU/IO.
 */
cmd_evdisp_t
opl_ue_mem(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl,
    int hdlr_type)
{
	nvlist_t *rsrc = NULL, *asru = NULL, *fru = NULL;
	uint64_t ubc_ue_log_reg, pa;
	cmd_page_t *page;

	if (nvlist_lookup_nvlist(nvl,
	    FM_EREPORT_PAYLOAD_NAME_RESOURCE, &rsrc) != 0)
		return (CMD_EVD_BAD);

	switch (hdlr_type) {
	case CMD_OPL_HDLR_CPU:

		if (nvlist_lookup_uint64(nvl,
		    FM_EREPORT_PAYLOAD_NAME_SFAR, &pa) != 0)
			return (CMD_EVD_BAD);

		fmd_hdl_debug(hdl, "cmd_ue_mem: pa=%llx\n",
		    (u_longlong_t)pa);
		break;

	case CMD_OPL_HDLR_IO:

		if (nvlist_lookup_uint64(nvl, OBERON_UBC_MUE,
		    &ubc_ue_log_reg) != 0)
			return (CMD_EVD_BAD);

		pa = (ubc_ue_log_reg & UBC_UE_ADR_MASK);

		fmd_hdl_debug(hdl, "cmd_ue_mem: ue_log_reg=%llx\n",
		    (u_longlong_t)ubc_ue_log_reg);
		fmd_hdl_debug(hdl, "cmd_ue_mem: pa=%llx\n",
		    (u_longlong_t)pa);
		break;

	default:

		return (CMD_EVD_BAD);
	}

	if ((page = cmd_page_lookup(pa)) != NULL &&
	    page->page_case.cc_cp != NULL &&
	    fmd_case_solved(hdl, page->page_case.cc_cp))
		return (CMD_EVD_REDUND);

	if (nvlist_dup(rsrc, &asru, 0) != 0) {
		fmd_hdl_debug(hdl, "opl_ue_mem nvlist dup failed\n");
		return (CMD_EVD_BAD);
	}

	if (fmd_nvl_fmri_expand(hdl, asru) < 0) {
		nvlist_free(asru);
		CMD_STAT_BUMP(bad_mem_asru);
		return (CMD_EVD_BAD);
	}

	if ((fru = opl_mem_fru_create(hdl, asru)) == NULL) {
		nvlist_free(asru);
		return (CMD_EVD_BAD);
	}

	cmd_page_fault(hdl, asru, fru, ep, pa);
	nvlist_free(asru);
	nvlist_free(fru);
	return (CMD_EVD_OK);
}
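
/*
 * opl_ue_mem() above is the common backend for the cmd_opl_cpu_mem()
 * and cmd_opl_io_mem() entry points at the end of this file; the
 * hdlr_type argument selects which payload member supplies the
 * faulting physical address (the SFAR for CPU-detected UEs, the
 * Oberon UBC UE log register for IO-detected UEs).
 */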

/*
 * The following is the main function to generate the sibling CPU
 * suspect list for CPU detected UE errors. This handles the
 * multiple strand/core architecture on the OPL platform.
 */
cmd_evdisp_t
cmd_opl_ue_cpu(fmd_hdl_t *hdl, fmd_event_t *ep,
    const char *class, const char *fltname,
    cmd_ptrsubtype_t ptr, cmd_cpu_t *cpu,
    cmd_case_t *cc, uint8_t cpumask)
{
	const char *uuid;
	cmd_cpu_t *main_cpu, *sib_cpu;
	nvlist_t *fmri;
	cmd_list_t *cpu_list;
	opl_cpu_t *opl_cpu;
	uint32_t main_cpuid, nsusp = 1;
	uint8_t cert;

	fmd_hdl_debug(hdl,
	    "Enter OPL_CPUUE_HANDLER for class %s\n", class);

	main_cpu = cpu;
	main_cpuid = cpu->cpu_cpuid;

	if (strcmp(fltname, "core") == 0)
		cpu_list = opl_cpulist_insert(hdl, cpu->cpu_cpuid,
		    IS_CORE);
	else if (strcmp(fltname, "chip") == 0)
		cpu_list = opl_cpulist_insert(hdl, cpu->cpu_cpuid,
		    IS_CHIP);
	else
		cpu_list = opl_cpulist_insert(hdl, cpu->cpu_cpuid,
		    IS_STRAND);

	for (opl_cpu = cmd_list_next(cpu_list); opl_cpu != NULL;
	    opl_cpu = cmd_list_next(opl_cpu)) {
		if (opl_cpu->oc_cpuid == main_cpuid) {
			sib_cpu = main_cpu;
			opl_cpu->oc_cmd_cpu = main_cpu;
		} else {
			fmri = cmd_cpu_fmri_create(opl_cpu->oc_cpuid, cpumask);
			if (fmri == NULL) {
				opl_cpu->oc_cmd_cpu = NULL;
				fmd_hdl_debug(hdl,
				    "missing asru, cpuid %u excluded\n",
				    opl_cpu->oc_cpuid);
				continue;
			}

			sib_cpu = cmd_cpu_lookup(hdl, fmri, class,
			    CMD_CPU_LEVEL_THREAD);
			if (sib_cpu == NULL || sib_cpu->cpu_faulting) {
				nvlist_free(fmri);
				opl_cpu->oc_cmd_cpu = NULL;
				fmd_hdl_debug(hdl,
				    "cpu not present, cpuid %u excluded\n",
				    opl_cpu->oc_cpuid);
				continue;
			}
			opl_cpu->oc_cmd_cpu = sib_cpu;
			nvlist_free(fmri);
			nsusp++;
		}
		if (cpu->cpu_cpuid == main_cpuid) {
			if (cc->cc_cp != NULL &&
			    fmd_case_solved(hdl, cc->cc_cp)) {
				if (cpu_list != NULL)
					opl_cpulist_free(hdl, cpu_list);
				return (CMD_EVD_REDUND);
			}

			if (cc->cc_cp == NULL)
				cc->cc_cp = cmd_case_create(hdl,
				    &cpu->cpu_header, ptr, &uuid);

			if (cc->cc_serdnm != NULL) {
				fmd_hdl_debug(hdl,
				    "destroying existing %s state "
				    "for class %s\n",
				    cc->cc_serdnm, class);
				fmd_serd_destroy(hdl, cc->cc_serdnm);
				fmd_hdl_strfree(hdl, cc->cc_serdnm);
				cc->cc_serdnm = NULL;
				fmd_case_reset(hdl, cc->cc_cp);
			}
			fmd_case_add_ereport(hdl, cc->cc_cp, ep);
		}
	}
	cert = opl_avg(100, nsusp);
	for (opl_cpu = cmd_list_next(cpu_list); opl_cpu != NULL;
	    opl_cpu = cmd_list_next(opl_cpu)) {
		if (opl_cpu->oc_cmd_cpu != NULL) {
			nvlist_t *cpu_rsrc;

			cpu_rsrc = opl_cpursrc_create(hdl, opl_cpu->oc_cpuid);
			if (cpu_rsrc == NULL) {
				fmd_hdl_debug(hdl,
				    "missing rsrc, cpuid %u excluded\n",
				    opl_cpu->oc_cpuid);
				continue;
			}
			cmd_cpu_create_faultlist(hdl, cc->cc_cp,
			    opl_cpu->oc_cmd_cpu, fltname, cpu_rsrc, cert);
			nvlist_free(cpu_rsrc);
		}
	}
	fmd_case_solve(hdl, cc->cc_cp);
	if (cpu_list != NULL)
		opl_cpulist_free(hdl, cpu_list);
	return (CMD_EVD_OK);
}
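
/*
 * Note on cmd_opl_ue_cpu() above: the fltname argument controls how
 * far the suspect list is expanded ("core" and "chip" expand it via
 * IS_CORE/IS_CHIP; any other name falls back to IS_STRAND), and the
 * 100% certainty is divided evenly across the resulting suspects by
 * opl_avg(), e.g. four suspects end up at 25% each.
 */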

/*
 * Generates a DIMM fault if the number of pages retired
 * due to permanent CEs exceeds the threshold.
 */
static void
opl_ce_thresh_check(fmd_hdl_t *hdl, cmd_dimm_t *dimm)
{
	nvlist_t *dflt;
	fmd_case_t *cp;

	fmd_hdl_debug(hdl,
	    "Permanent CE event threshold checking.\n");

	if (dimm->dimm_flags & CMD_MEM_F_FAULTING) {
		/* We've already complained about this DIMM */
		return;
	}

	if (dimm->dimm_nretired >= fmd_prop_get_int32(hdl,
	    "max_perm_ce_dimm")) {
		dimm->dimm_flags |= CMD_MEM_F_FAULTING;
		cp = fmd_case_open(hdl, NULL);
		dflt = cmd_dimm_create_fault(hdl, dimm, "fault.memory.dimm",
		    CMD_FLTMAXCONF);
		fmd_case_add_suspect(hdl, cp, dflt);
		fmd_case_solve(hdl, cp);
	}
}

/*
 * Notify the XSCF of fault page information (pa and errlog) via mc-opl.
 */
#define	MC_PHYDEV_DIR	"/devices"
#define	MC_PHYPREFIX	"pseudo-mc@"
static int
opl_scf_log(fmd_hdl_t *hdl, nvlist_t *nvl)
{
	uint32_t *eadd, *elog;
	uint_t n;
	uint64_t pa;
	char path[MAXPATHLEN];
	char *unum;
	nvlist_t *rsrc;
	DIR *mcdir;
	struct dirent *dp;
	mc_flt_page_t flt_page;
	cmd_page_t *page;
	struct stat statbuf;

	/*
	 * Extract the ereport.
	 * The sanity check of pa is already done in cmd_opl_mac_common().
	 * mc-opl sets only one entry for MC_OPL_ERR_ADD, MC_OPL_ERR_LOG,
	 * and MC_OPL_BANK.
	 */
	if ((nvlist_lookup_uint64(nvl, MC_OPL_PA, &pa) != 0) ||
	    (nvlist_lookup_uint32_array(nvl, MC_OPL_ERR_ADD, &eadd, &n) != 0) ||
	    (nvlist_lookup_uint32_array(nvl, MC_OPL_ERR_LOG, &elog, &n) != 0)) {
		fmd_hdl_debug(hdl, "opl_scf_log failed to extract ereport.\n");
		return (-1);
	}
	if (nvlist_lookup_nvlist(nvl, FM_EREPORT_PAYLOAD_NAME_RESOURCE,
	    &rsrc) != 0) {
		fmd_hdl_debug(hdl, "opl_scf_log failed to get resource.\n");
		return (-1);
	}
	if (nvlist_lookup_string(rsrc, FM_FMRI_MEM_UNUM, &unum) != 0) {
		fmd_hdl_debug(hdl, "opl_scf_log failed to get unum.\n");
		return (-1);
	}

	page = cmd_page_lookup(pa);
	if (page != NULL && page->page_flags & CMD_MEM_F_FAULTING) {
		/*
		 * fault.memory.page will not be created.
		 */
		return (0);
	}

	/* Hand the unum string address and size to the driver. */
	flt_page.err_add = eadd[0];
	flt_page.err_log = elog[0];
	flt_page.fmri_addr = (uint64_t)(uint32_t)unum;
	flt_page.fmri_sz = strlen(unum) + 1;

	fmd_hdl_debug(hdl, "opl_scf_log DIMM: %s (%d)\n",
	    unum, (int)(strlen(unum) + 1));
	fmd_hdl_debug(hdl, "opl_scf_log pa:%llx add:%x log:%x\n",
	    (u_longlong_t)pa, eadd[0], elog[0]);

	if ((mcdir = opendir(MC_PHYDEV_DIR)) != NULL) {
		while ((dp = readdir(mcdir)) != NULL) {
			int fd;

			if (strncmp(dp->d_name, MC_PHYPREFIX,
			    strlen(MC_PHYPREFIX)) != 0)
				continue;

			(void) snprintf(path, sizeof (path),
			    "%s/%s", MC_PHYDEV_DIR, dp->d_name);

			if (stat(path, &statbuf) != 0 ||
			    !S_ISCHR(statbuf.st_mode)) {
				/* skip if not a character device */
				continue;
			}

			if ((fd = open(path, O_RDONLY)) < 0)
				continue;

			if (ioctl(fd, MCIOC_FAULT_PAGE, &flt_page) == 0) {
				fmd_hdl_debug(hdl, "opl_scf_log ioctl(%s)\n",
				    path);
				(void) close(fd);
				(void) closedir(mcdir);
				return (0);
			}
			(void) close(fd);
		}
		(void) closedir(mcdir);
	}

	fmd_hdl_debug(hdl, "opl_scf_log failed ioctl().\n");

	return (-1);
}
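
/*
 * opl_scf_log() above returns 0 once any pseudo-mc node accepts the
 * MCIOC_FAULT_PAGE ioctl (or when the page is already marked faulting),
 * and -1 otherwise.  Its caller treats a failure as non-fatal; see
 * cmd_opl_mac_ce() below, which ignores the return value.
 */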

/*
 * This is the common function for processing MAC detected
 * Intermittent and Permanent CEs.
 */

cmd_evdisp_t
cmd_opl_mac_ce(fmd_hdl_t *hdl, fmd_event_t *ep, const char *class,
    nvlist_t *asru, nvlist_t *fru, uint64_t pa, nvlist_t *nvl)
{
	cmd_dimm_t *dimm;
	const char *uuid;

	fmd_hdl_debug(hdl,
	    "Processing CE ereport\n");

	if ((dimm = cmd_dimm_lookup(hdl, asru)) == NULL &&
	    (dimm = cmd_dimm_create(hdl, asru)) == NULL)
		return (CMD_EVD_UNUSED);

	if (dimm->dimm_case.cc_cp == NULL) {
		dimm->dimm_case.cc_cp = cmd_case_create(hdl,
		    &dimm->dimm_header, CMD_PTR_DIMM_CASE, &uuid);
	}

	if (strcmp(class, "ereport.asic.mac.ptrl-ice") == 0) {
		CMD_STAT_BUMP(ce_interm);
		fmd_hdl_debug(hdl, "adding FJ-Intermittent event "
		    "to CE serd engine\n");

		if (dimm->dimm_case.cc_serdnm == NULL) {
			dimm->dimm_case.cc_serdnm =
			    cmd_mem_serdnm_create(hdl,
			    "dimm", dimm->dimm_unum);
			fmd_serd_create(hdl, dimm->dimm_case.cc_serdnm,
			    fmd_prop_get_int32(hdl, "ce_n"),
			    fmd_prop_get_int64(hdl, "ce_t"));
		}

		if (fmd_serd_record(hdl, dimm->dimm_case.cc_serdnm, ep) ==
		    FMD_B_FALSE) {
			return (CMD_EVD_OK); /* engine hasn't fired */
		}
		fmd_hdl_debug(hdl, "ce serd fired\n");
		fmd_case_add_serd(hdl, dimm->dimm_case.cc_cp,
		    dimm->dimm_case.cc_serdnm);
		fmd_serd_reset(hdl, dimm->dimm_case.cc_serdnm);

		(void) opl_scf_log(hdl, nvl);
	} else {
		CMD_STAT_BUMP(ce_sticky);
	}

	dimm->dimm_nretired++;
	dimm->dimm_retstat.fmds_value.ui64++;
	cmd_dimm_dirty(hdl, dimm);

	cmd_page_fault(hdl, asru, fru, ep, pa);
	opl_ce_thresh_check(hdl, dimm);

	return (CMD_EVD_OK);
}
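
/*
 * In cmd_opl_mac_ce() above, only the intermittent CE class
 * (ereport.asic.mac.ptrl-ice) is gated by the CE SERD engine and,
 * once the engine fires, reported to the XSCF through opl_scf_log();
 * permanent CEs proceed directly to the retire bookkeeping, the page
 * fault, and the DIMM threshold check.
 */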

/*
 * This is the common entry point for processing MAC detected errors.
 * It is responsible for generating the memory page fault event.
 * Permanent CEs (sticky) in normal mode are also handled here,
 * in the same way as in the UE case.
 */
/*ARGSUSED*/
cmd_evdisp_t
cmd_opl_mac_common(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl,
    const char *class, cmd_errcl_t clcode)
{
	uint64_t pa;
	nvlist_t *rsrc = NULL, *asru = NULL, *fru = NULL;
	cmd_page_t *page;

	fmd_hdl_debug(hdl, "cmd_mac_common: clcode=%llx\n",
	    (u_longlong_t)clcode);

	if (nvlist_lookup_nvlist(nvl, MC_OPL_RESOURCE, &rsrc) != 0)
		return (CMD_EVD_BAD);

	if (nvlist_lookup_uint64(nvl, MC_OPL_PA, &pa) != 0)
		return (CMD_EVD_BAD);

	/*
	 * Check for an invalid pa.
	 * The most significant bit should not be set; such an
	 * address would be outside the range of possible pa
	 * values in the MAC's view.
	 */
	if (((uint64_t)1 << 63) & pa)
		return (CMD_EVD_BAD);

	if ((page = cmd_page_lookup(pa)) != NULL &&
	    page->page_case.cc_cp != NULL &&
	    fmd_case_solved(hdl, page->page_case.cc_cp))
		return (CMD_EVD_REDUND);

	if (nvlist_dup(rsrc, &asru, 0) != 0) {
		fmd_hdl_debug(hdl, "cmd_opl_mac_common nvlist dup failed\n");
		return (CMD_EVD_BAD);
	}

	if (fmd_nvl_fmri_expand(hdl, asru) < 0) {
		fmd_hdl_debug(hdl, "cmd_opl_mac_common expand failed\n");
		nvlist_free(asru);
		CMD_STAT_BUMP(bad_mem_asru);
		return (CMD_EVD_BAD);
	}

	if ((fru = opl_mem_fru_create(hdl, asru)) == NULL) {
		fmd_hdl_debug(hdl, "cmd_opl_mac_common fru_create failed\n");
		nvlist_free(asru);
		return (CMD_EVD_BAD);
	}

	/*
	 * Process PCE and ICE to create a DIMM fault.
	 */
	if (strcmp(class, "ereport.asic.mac.mi-ce") == 0 ||
	    strcmp(class, "ereport.asic.mac.ptrl-ce") == 0 ||
	    strcmp(class, "ereport.asic.mac.ptrl-ice") == 0) {
		cmd_evdisp_t ret;

		ret = cmd_opl_mac_ce(hdl, ep, class, asru, fru, pa, nvl);
		nvlist_free(asru);
		nvlist_free(fru);
		if (ret != CMD_EVD_OK) {
			fmd_hdl_debug(hdl,
			    "cmd_opl_mac_common: mac_ce failed\n");
			return (CMD_EVD_BAD);
		} else
			return (CMD_EVD_OK);
	}

	/* The following code handles page retires for UEs and CMPEs. */

	cmd_page_fault(hdl, asru, fru, ep, pa);
	nvlist_free(asru);
	nvlist_free(fru);
	return (CMD_EVD_OK);
}

/*
 * Common entry points for handling CPU/IO detected UE with
 * respect to EID=MEM.
 */
/*ARGSUSED*/
cmd_evdisp_t
cmd_opl_cpu_mem(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl,
    const char *class, cmd_errcl_t clcode)
{
	return (opl_ue_mem(hdl, ep, nvl, CMD_OPL_HDLR_CPU));
}

/*ARGSUSED*/
cmd_evdisp_t
cmd_opl_io_mem(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl,
    const char *class, cmd_errcl_t clcode)
{
	return (opl_ue_mem(hdl, ep, nvl, CMD_OPL_HDLR_IO));
}