1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. 23 */ 24 25 /* 26 * Ereport-handling routines for memory errors 27 */ 28 29 #include <cmd_mem.h> 30 #include <cmd_dimm.h> 31 #include <cmd_bank.h> 32 #include <cmd_page.h> 33 #include <cmd_cpu.h> 34 #ifdef sun4u 35 #include <cmd_dp.h> 36 #include <cmd_dp_page.h> 37 #endif 38 #include <cmd.h> 39 40 #include <strings.h> 41 #include <string.h> 42 #include <errno.h> 43 #include <limits.h> 44 #include <fm/fmd_api.h> 45 #include <sys/fm/protocol.h> 46 #include <sys/async.h> 47 #include <sys/errclassify.h> 48 #include <assert.h> 49 50 #ifdef sun4v 51 #include <cmd_hc_sun4v.h> 52 #endif /* sun4v */ 53 54 struct ce_name2type { 55 const char *name; 56 ce_dispact_t type; 57 }; 58 59 ce_dispact_t 60 cmd_mem_name2type(const char *name, int minorvers) 61 { 62 static const struct ce_name2type old[] = { 63 { ERR_TYPE_DESC_INTERMITTENT, CE_DISP_INTERMITTENT }, 64 { ERR_TYPE_DESC_PERSISTENT, CE_DISP_PERS }, 65 { ERR_TYPE_DESC_STICKY, CE_DISP_STICKY }, 66 { ERR_TYPE_DESC_UNKNOWN, CE_DISP_UNKNOWN }, 67 { NULL } 68 }; 69 static const struct ce_name2type new[] = { 70 { CE_DISP_DESC_U, CE_DISP_UNKNOWN }, 71 { CE_DISP_DESC_I, CE_DISP_INTERMITTENT }, 72 { CE_DISP_DESC_PP, CE_DISP_POSS_PERS }, 73 { CE_DISP_DESC_P, CE_DISP_PERS }, 74 { CE_DISP_DESC_L, CE_DISP_LEAKY }, 75 { CE_DISP_DESC_PS, CE_DISP_POSS_STICKY }, 76 { CE_DISP_DESC_S, CE_DISP_STICKY }, 77 { NULL } 78 }; 79 const struct ce_name2type *names = (minorvers == 0) ? &old[0] : &new[0]; 80 const struct ce_name2type *tp; 81 82 for (tp = names; tp->name != NULL; tp++) 83 if (strcasecmp(name, tp->name) == 0) 84 return (tp->type); 85 86 return (CE_DISP_UNKNOWN); 87 } 88 89 /* 90 * check if a dimm has n CEs with the same symbol-in-error 91 */ 92 static int 93 upos_thresh_check(cmd_dimm_t *dimm, uint16_t upos, uint32_t threshold) 94 { 95 int i; 96 cmd_mq_t *ip, *next; 97 int count = 0; 98 99 for (i = 0; i < CMD_MAX_CKWDS; i++) { 100 for (ip = cmd_list_next(&dimm->mq_root[i]); ip != NULL; 101 ip = next) { 102 next = cmd_list_next(ip); 103 if (ip->mq_unit_position == upos) { 104 count++; 105 if (count >= threshold) 106 return (1); 107 } 108 } 109 } 110 return (0); 111 } 112 113 /* 114 * check if smaller number of retired pages > 1/16 of larger 115 * number of retired pages 116 */ 117 static int 118 check_bad_rw_retired_pages(fmd_hdl_t *hdl, cmd_dimm_t *d1, cmd_dimm_t *d2) 119 { 120 uint_t sret, lret; 121 double ratio; 122 uint_t d1_nretired, d2_nretired; 123 124 sret = lret = 0; 125 126 d1_nretired = d1->dimm_nretired; 127 d2_nretired = d2->dimm_nretired; 128 129 if (d1->dimm_bank != NULL) 130 d1_nretired += d1->dimm_bank->bank_nretired; 131 132 if (d2->dimm_bank != NULL) 133 d2_nretired += d2->dimm_bank->bank_nretired; 134 135 if (d2_nretired < d1_nretired) { 136 sret = d2_nretired; 137 lret = d1_nretired; 138 } else if (d2_nretired > d1_nretired) { 139 sret = d1_nretired; 140 lret = d2_nretired; 141 } else 142 return (0); 143 144 ratio = lret * CMD_PAGE_RATIO; 145 146 if (sret > ratio) { 147 fmd_hdl_debug(hdl, "sret=%d lret=%d ratio=%.3f\n", 148 sret, lret, ratio); 149 return (1); 150 } 151 return (0); 152 } 153 154 /* 155 * check bad rw between two DIMMs 156 * the check succeeds if 157 * - each DIMM has 4 CEs with the same symbol-in-error. 158 * - the smaller number of retired pages > 1/16 larger number of retired pages 159 */ 160 static int 161 check_bad_rw_between_dimms(fmd_hdl_t *hdl, cmd_dimm_t *d1, cmd_dimm_t *d2, 162 uint16_t *rupos) 163 { 164 int i; 165 cmd_mq_t *ip, *next; 166 uint16_t upos; 167 168 for (i = 0; i < CMD_MAX_CKWDS; i++) { 169 for (ip = cmd_list_next(&d1->mq_root[i]); ip != NULL; 170 ip = next) { 171 next = cmd_list_next(ip); 172 upos = ip->mq_unit_position; 173 if (upos_thresh_check(d1, upos, cmd.cmd_nupos)) { 174 if (upos_thresh_check(d2, upos, 175 cmd.cmd_nupos)) { 176 if (check_bad_rw_retired_pages(hdl, 177 d1, d2)) { 178 *rupos = upos; 179 return (1); 180 } 181 } 182 } 183 } 184 } 185 186 return (0); 187 } 188 189 static void 190 bad_reader_writer_check(fmd_hdl_t *hdl, cmd_dimm_t *ce_dimm, nvlist_t *det) 191 { 192 cmd_dimm_t *d, *next; 193 uint16_t upos; 194 195 for (d = cmd_list_next(&cmd.cmd_dimms); d != NULL; d = next) { 196 next = cmd_list_next(d); 197 if (d == ce_dimm) 198 continue; 199 if (!cmd_same_datapath_dimms(ce_dimm, d)) 200 continue; 201 if (check_bad_rw_between_dimms(hdl, ce_dimm, d, &upos)) { 202 cmd_gen_datapath_fault(hdl, ce_dimm, d, upos, det); 203 cmd_dimm_save_symbol_error(ce_dimm, upos); 204 fmd_hdl_debug(hdl, 205 "check_bad_rw_dimms succeeded: %s %s", 206 ce_dimm->dimm_unum, d->dimm_unum); 207 return; 208 } 209 } 210 } 211 212 /* 213 * rule 5a checking. The check succeeds if 214 * - nretired >= 512 215 * - nretired >= 128 and (addr_hi - addr_low) / (nretired - 1) > 512KB 216 */ 217 static void 218 ce_thresh_check(fmd_hdl_t *hdl, cmd_dimm_t *dimm) 219 { 220 nvlist_t *flt; 221 fmd_case_t *cp; 222 uint_t nret; 223 uint64_t delta_addr = 0; 224 225 if (dimm->dimm_flags & CMD_MEM_F_FAULTING) 226 /* We've already complained about this DIMM */ 227 return; 228 229 nret = dimm->dimm_nretired; 230 if (dimm->dimm_bank != NULL) 231 nret += dimm->dimm_bank->bank_nretired; 232 233 if (nret < cmd.cmd_low_ce_thresh) 234 return; 235 236 if (dimm->dimm_phys_addr_hi >= dimm->dimm_phys_addr_low) 237 delta_addr = 238 (dimm->dimm_phys_addr_hi - dimm->dimm_phys_addr_low) / 239 (nret - 1); 240 241 if (nret >= cmd.cmd_hi_ce_thresh || delta_addr > CMD_MQ_512KB) { 242 243 dimm->dimm_flags |= CMD_MEM_F_FAULTING; 244 cmd_dimm_dirty(hdl, dimm); 245 246 cp = fmd_case_open(hdl, NULL); 247 flt = cmd_dimm_create_fault(hdl, dimm, 248 "fault.memory.dimm-page-retires-excessive", CMD_FLTMAXCONF); 249 fmd_case_add_suspect(hdl, cp, flt); 250 fmd_case_solve(hdl, cp); 251 fmd_hdl_debug(hdl, "ce_thresh_check succeeded nretired %d\n", 252 nret); 253 254 } 255 } 256 257 /* 258 * rule 5b checking. The check succeeds if 259 * more than 120 non-intermittent CEs are reported against one symbol 260 * position of one afar in 72 hours. 261 */ 262 static void 263 mq_5b_check(fmd_hdl_t *hdl, cmd_dimm_t *dimm) 264 { 265 nvlist_t *flt; 266 fmd_case_t *cp; 267 cmd_mq_t *ip, *next; 268 int cw; 269 270 for (cw = 0; cw < CMD_MAX_CKWDS; cw++) { 271 for (ip = cmd_list_next(&dimm->mq_root[cw]); 272 ip != NULL; ip = next) { 273 next = cmd_list_next(ip); 274 if (ip->mq_dupce_count >= cmd.cmd_dupce) { 275 cp = fmd_case_open(hdl, NULL); 276 flt = cmd_dimm_create_fault(hdl, dimm, 277 "fault.memory.dimm-page-retires-excessive", 278 CMD_FLTMAXCONF); 279 dimm->dimm_flags |= CMD_MEM_F_FAULTING; 280 cmd_dimm_dirty(hdl, dimm); 281 fmd_case_add_suspect(hdl, cp, flt); 282 fmd_case_solve(hdl, cp); 283 fmd_hdl_debug(hdl, 284 "mq_5b_check succeeded: duplicate CE=%d", 285 ip->mq_dupce_count); 286 return; 287 } 288 } 289 } 290 } 291 292 /* 293 * delete the expired duplicate CE time stamps 294 */ 295 void 296 mq_prune_dup(fmd_hdl_t *hdl, cmd_mq_t *ip, uint64_t now) 297 { 298 tstamp_t *tsp, *next; 299 300 for (tsp = cmd_list_next(&ip->mq_dupce_tstamp); tsp != NULL; 301 tsp = next) { 302 next = cmd_list_next(tsp); 303 if (tsp->tstamp < now - CMD_MQ_TIMELIM) { 304 cmd_list_delete(&ip->mq_dupce_tstamp, &tsp->ts_l); 305 fmd_hdl_free(hdl, tsp, sizeof (tstamp_t)); 306 ip->mq_dupce_count--; 307 } 308 } 309 } 310 311 void 312 mq_update(fmd_hdl_t *hdl, fmd_event_t *ep, cmd_mq_t *ip, uint64_t now, 313 uint32_t cpuid) 314 { 315 tstamp_t *tsp; 316 317 ip->mq_tstamp = now; 318 ip->mq_cpuid = cpuid; 319 ip->mq_ep = ep; 320 321 if (fmd_serd_exists(hdl, ip->mq_serdnm)) 322 fmd_serd_destroy(hdl, ip->mq_serdnm); 323 fmd_serd_create(hdl, ip->mq_serdnm, CMD_MQ_SERDN, CMD_MQ_SERDT); 324 (void) fmd_serd_record(hdl, ip->mq_serdnm, ep); 325 326 tsp = fmd_hdl_zalloc(hdl, sizeof (tstamp_t), FMD_SLEEP); 327 tsp->tstamp = now; 328 cmd_list_append(&ip->mq_dupce_tstamp, tsp); 329 ip->mq_dupce_count++; 330 } 331 332 /* Create a fresh index block for MQSC CE correlation. */ 333 cmd_mq_t * 334 mq_create(fmd_hdl_t *hdl, fmd_event_t *ep, 335 uint64_t afar, uint16_t upos, uint64_t now, uint32_t cpuid) 336 { 337 cmd_mq_t *cp; 338 tstamp_t *tsp; 339 uint16_t ckwd = (afar & 0x30) >> 4; 340 341 cp = fmd_hdl_zalloc(hdl, sizeof (cmd_mq_t), FMD_SLEEP); 342 cp->mq_tstamp = now; 343 cp->mq_ckwd = ckwd; 344 cp->mq_phys_addr = afar; 345 cp->mq_unit_position = upos; 346 cp->mq_ep = ep; 347 cp->mq_serdnm = 348 cmd_mq_serdnm_create(hdl, "mq", afar, ckwd, upos); 349 350 tsp = fmd_hdl_zalloc(hdl, sizeof (tstamp_t), FMD_SLEEP); 351 tsp->tstamp = now; 352 cmd_list_append(&cp->mq_dupce_tstamp, tsp); 353 cp->mq_dupce_count = 1; 354 cp->mq_cpuid = cpuid; 355 356 /* 357 * Create SERD to keep this event from being removed 358 * by fmd which may not know there is an event pointer 359 * saved here. This SERD is *never* meant to fire. 360 * NOTE: wouldn't need to do this if there were an fmd 361 * api to 'hold' an event. 362 */ 363 if (fmd_serd_exists(hdl, cp->mq_serdnm)) { 364 /* clean up dup */ 365 fmd_serd_destroy(hdl, cp->mq_serdnm); 366 } 367 fmd_serd_create(hdl, cp->mq_serdnm, CMD_MQ_SERDN, CMD_MQ_SERDT); 368 (void) fmd_serd_record(hdl, cp->mq_serdnm, ep); 369 370 return (cp); 371 } 372 373 /* Destroy MQSC tracking block as well as event tracking SERD. */ 374 375 cmd_mq_t * 376 mq_destroy(fmd_hdl_t *hdl, cmd_list_t *lp, cmd_mq_t *ip) 377 { 378 cmd_mq_t *jp = cmd_list_next(ip); 379 tstamp_t *tsp, *next; 380 381 if (ip->mq_serdnm != NULL) { 382 if (fmd_serd_exists(hdl, ip->mq_serdnm)) 383 fmd_serd_destroy(hdl, ip->mq_serdnm); 384 fmd_hdl_strfree(hdl, ip->mq_serdnm); 385 ip->mq_serdnm = NULL; 386 } 387 388 for (tsp = cmd_list_next(&ip->mq_dupce_tstamp); tsp != NULL; 389 tsp = next) { 390 next = cmd_list_next(tsp); 391 cmd_list_delete(&ip->mq_dupce_tstamp, &tsp->ts_l); 392 fmd_hdl_free(hdl, tsp, sizeof (tstamp_t)); 393 } 394 395 cmd_list_delete(lp, &ip->mq_l); 396 fmd_hdl_free(hdl, ip, sizeof (cmd_mq_t)); 397 398 return (jp); 399 } 400 401 /* 402 * Add an index block for a new CE, sorted 403 * a) by ascending unit position 404 * b) order of arrival (~= time order) 405 */ 406 407 void 408 mq_add(fmd_hdl_t *hdl, cmd_dimm_t *dimm, fmd_event_t *ep, 409 uint64_t afar, uint16_t synd, uint64_t now, uint32_t cpuid) 410 { 411 cmd_mq_t *ip, *jp; 412 int cw, unit_position; 413 414 cw = (afar & 0x30) >> 4; /* 0:3 */ 415 if ((unit_position = cmd_synd2upos(synd)) < 0) 416 return; /* not a CE */ 417 418 for (ip = cmd_list_next(&dimm->mq_root[cw]); ip != NULL; ) { 419 if (ip->mq_unit_position > unit_position) { 420 /* list is in unit position order */ 421 break; 422 } else if (ip->mq_unit_position == unit_position && 423 ip->mq_phys_addr == afar) { 424 /* 425 * Found a duplicate cw, unit_position, and afar. 426 * update the mq_t with the new information 427 */ 428 mq_update(hdl, ep, ip, now, cpuid); 429 return; 430 } else { 431 ip = cmd_list_next(ip); 432 } 433 } 434 435 jp = mq_create(hdl, ep, afar, unit_position, now, cpuid); 436 if (ip == NULL) 437 cmd_list_append(&dimm->mq_root[cw], jp); 438 else 439 cmd_list_insert_before(&dimm->mq_root[cw], ip, jp); 440 } 441 442 /* 443 * Prune the MQSC index lists (one for each checkword), by deleting 444 * outdated index blocks from each list. 445 */ 446 447 void 448 mq_prune(fmd_hdl_t *hdl, cmd_dimm_t *dimm, uint64_t now) 449 { 450 cmd_mq_t *ip; 451 int cw; 452 453 for (cw = 0; cw < CMD_MAX_CKWDS; cw++) { 454 for (ip = cmd_list_next(&dimm->mq_root[cw]); ip != NULL; ) { 455 if (ip->mq_tstamp < now - CMD_MQ_TIMELIM) { 456 /* 457 * This event has timed out - delete the 458 * mq block as well as serd for the event. 459 */ 460 ip = mq_destroy(hdl, &dimm->mq_root[cw], ip); 461 } else { 462 /* tstamp < now - ce_t */ 463 mq_prune_dup(hdl, ip, now); 464 ip = cmd_list_next(ip); 465 } 466 } /* per checkword */ 467 } /* cw = 0...3 */ 468 } 469 470 /* 471 * Check the MQSC index lists (one for each checkword) by making a 472 * complete pass through each list, checking if the criteria for 473 * Rule 4A has been met. Rule 4A checking is done for each checkword. 474 * 475 * Rule 4A: fault a DIMM "whenever Solaris reports two or more CEs from 476 * two or more different physical addresses on each of two or more different 477 * bit positions from the same DIMM within 72 hours of each other, and all 478 * the addresses are in the same relative checkword (that is, the AFARs 479 * are all the same modulo 64). [Note: This means at least 4 CEs; two 480 * from one bit position, with unique addresses, and two from another, 481 * also with unique addresses, and the lower 6 bits of all the addresses 482 * are the same." 483 */ 484 485 void 486 mq_check(fmd_hdl_t *hdl, cmd_dimm_t *dimm) 487 { 488 int upos_pairs, curr_upos, cw, i, j; 489 nvlist_t *flt; 490 typedef struct upos_pair { 491 int upos; 492 cmd_mq_t *mq1; 493 cmd_mq_t *mq2; 494 } upos_pair_t; 495 upos_pair_t upos_array[8]; /* max per cw = 2, * 4 cw's */ 496 cmd_mq_t *ip; 497 498 /* 499 * Each upos_array[] member represents a pair of CEs for the same 500 * unit position (symbol) which on a sun4u is a bit, and on sun4v 501 * is a (4 bit) nibble. 502 * MQSC rule 4 requires pairs of CEs from the same symbol (same DIMM 503 * for rule 4A, and same DRAM for rule 4B) for a violation - this 504 * is why CE pairs are tracked. 505 */ 506 upos_pairs = 0; 507 upos_array[0].mq1 = NULL; 508 509 /* Loop through all checkwords */ 510 for (cw = 0; cw < CMD_MAX_CKWDS; cw++) { 511 i = upos_pairs; 512 curr_upos = -1; 513 514 /* 515 * mq_root[] is an array of cumulative lists of CEs 516 * indexed by checkword where the list is in unit position 517 * order. Loop through checking for duplicate unit position 518 * entries (filled in at mq_create()). 519 * The upos_array[] is filled in each time a duplicate 520 * unit position is found; the first time through the loop 521 * of a unit position sets curr_upos but does not fill in 522 * upos_array[] until the second symbol is found. 523 */ 524 for (ip = cmd_list_next(&dimm->mq_root[cw]); ip != NULL; 525 ip = cmd_list_next(ip)) { 526 if (curr_upos != ip->mq_unit_position) { 527 /* Set initial current position */ 528 curr_upos = ip->mq_unit_position; 529 } else if (i > upos_pairs && 530 curr_upos == upos_array[i-1].upos) { 531 /* 532 * Only keep track of CE pairs; skip 533 * triples, quads, etc... 534 */ 535 continue; 536 } else if (upos_array[i].mq1 == NULL) { 537 /* 538 * Have a pair, add to upos_array[]. 539 */ 540 upos_array[i].upos = curr_upos; 541 upos_array[i].mq1 = cmd_list_prev(ip); 542 upos_array[i].mq2 = ip; 543 upos_array[++i].mq1 = NULL; 544 } 545 } 546 547 if (i - upos_pairs >= 2) { 548 /* Rule 4A Violation. */ 549 flt = cmd_dimm_create_fault(hdl, 550 dimm, "fault.memory.dimm-ue-imminent", 551 CMD_FLTMAXCONF); 552 for (j = upos_pairs; j < i; j++) { 553 fmd_case_add_ereport(hdl, 554 dimm->dimm_case.cc_cp, 555 upos_array[j].mq1->mq_ep); 556 fmd_case_add_ereport(hdl, 557 dimm->dimm_case.cc_cp, 558 upos_array[j].mq2->mq_ep); 559 } 560 dimm->dimm_flags |= CMD_MEM_F_FAULTING; 561 cmd_dimm_dirty(hdl, dimm); 562 fmd_case_add_suspect(hdl, dimm->dimm_case.cc_cp, flt); 563 fmd_case_solve(hdl, dimm->dimm_case.cc_cp); 564 return; 565 } 566 upos_pairs = i; 567 assert(upos_pairs < 8); 568 } 569 } 570 571 /*ARGSUSED*/ 572 cmd_evdisp_t 573 cmd_ce_common(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, 574 const char *class, uint64_t afar, uint8_t afar_status, uint16_t synd, 575 uint8_t synd_status, ce_dispact_t type, uint64_t disp, nvlist_t *asru) 576 { 577 cmd_dimm_t *dimm; 578 cmd_page_t *page; 579 const char *uuid; 580 uint64_t *now; 581 uint_t nelem; 582 uint32_t cpuid; 583 nvlist_t *det; 584 uint64_t addr; 585 int skip_error = 0; 586 587 if (afar_status != AFLT_STAT_VALID || 588 synd_status != AFLT_STAT_VALID) 589 return (CMD_EVD_UNUSED); 590 591 if ((page = cmd_page_lookup(afar)) != NULL && 592 page->page_case.cc_cp != NULL && 593 fmd_case_solved(hdl, page->page_case.cc_cp)) 594 return (CMD_EVD_REDUND); 595 596 #ifdef sun4u 597 if (cmd_dp_error(hdl) || cmd_dp_fault(hdl, afar)) { 598 CMD_STAT_BUMP(dp_ignored_ce); 599 return (CMD_EVD_UNUSED); 600 } 601 #endif /* sun4u */ 602 603 if (fmd_nvl_fmri_expand(hdl, asru) < 0) { 604 CMD_STAT_BUMP(bad_mem_asru); 605 return (CMD_EVD_BAD); 606 } 607 608 if ((dimm = cmd_dimm_lookup(hdl, asru)) == NULL && 609 (dimm = cmd_dimm_create(hdl, asru)) == NULL) 610 return (CMD_EVD_UNUSED); 611 612 if (dimm->dimm_case.cc_cp == NULL) { 613 dimm->dimm_case.cc_cp = cmd_case_create(hdl, 614 &dimm->dimm_header, CMD_PTR_DIMM_CASE, &uuid); 615 } 616 617 if (nvlist_lookup_nvlist(nvl, FM_EREPORT_DETECTOR, &det) != 0) 618 return (CMD_EVD_BAD); 619 620 /* 621 * Add to MQSC correlation lists all CEs which pass validity 622 * checks above. 623 * Add mq_t when there is no bad r/w or dimm fault. 624 * Always prune the expired mq_t. 625 */ 626 skip_error = cmd_dimm_check_symbol_error(dimm, synd); 627 628 if (nvlist_lookup_uint64_array(nvl, 629 "__tod", &now, &nelem) == 0) { 630 631 if (!skip_error || !(dimm->dimm_flags & CMD_MEM_F_FAULTING)) { 632 if (nvlist_lookup_uint32(det, FM_FMRI_CPU_ID, &cpuid) 633 != 0) 634 cpuid = ULONG_MAX; 635 636 mq_add(hdl, dimm, ep, afar, synd, *now, cpuid); 637 } 638 639 mq_prune(hdl, dimm, *now); 640 641 if (!skip_error) 642 bad_reader_writer_check(hdl, dimm, det); 643 644 if (!(dimm->dimm_flags & CMD_MEM_F_FAULTING)) { 645 mq_check(hdl, dimm); 646 mq_5b_check(hdl, dimm); 647 } 648 } 649 650 switch (type) { 651 case CE_DISP_UNKNOWN: 652 CMD_STAT_BUMP(ce_unknown); 653 return (CMD_EVD_UNUSED); 654 case CE_DISP_INTERMITTENT: 655 CMD_STAT_BUMP(ce_interm); 656 return (CMD_EVD_UNUSED); 657 case CE_DISP_POSS_PERS: 658 CMD_STAT_BUMP(ce_ppersis); 659 break; 660 case CE_DISP_PERS: 661 CMD_STAT_BUMP(ce_persis); 662 break; 663 case CE_DISP_LEAKY: 664 CMD_STAT_BUMP(ce_leaky); 665 break; 666 case CE_DISP_POSS_STICKY: 667 { 668 uchar_t ptnrinfo = CE_XDIAG_PTNRINFO(disp); 669 670 if (CE_XDIAG_TESTVALID(ptnrinfo)) { 671 int ce1 = CE_XDIAG_CE1SEEN(ptnrinfo); 672 int ce2 = CE_XDIAG_CE2SEEN(ptnrinfo); 673 674 if (ce1 && ce2) { 675 /* Should have been CE_DISP_STICKY */ 676 return (CMD_EVD_BAD); 677 } else if (ce1) { 678 /* Partner could see and could fix CE */ 679 CMD_STAT_BUMP(ce_psticky_ptnrclrd); 680 } else { 681 /* Partner could not see ce1 (ignore ce2) */ 682 CMD_STAT_BUMP(ce_psticky_ptnrnoerr); 683 } 684 } else { 685 CMD_STAT_BUMP(ce_psticky_noptnr); 686 } 687 return (CMD_EVD_UNUSED); 688 } 689 case CE_DISP_STICKY: 690 CMD_STAT_BUMP(ce_sticky); 691 break; 692 default: 693 return (CMD_EVD_BAD); 694 } 695 696 if (cmd_dimm_check_symbol_error(dimm, synd)) 697 return (CMD_EVD_REDUND); 698 699 if (page == NULL) 700 page = cmd_page_create(hdl, asru, afar); 701 702 if (page->page_case.cc_cp == NULL) { 703 page->page_case.cc_cp = cmd_case_create(hdl, 704 &page->page_header, CMD_PTR_PAGE_CASE, &uuid); 705 } 706 707 switch (type) { 708 case CE_DISP_POSS_PERS: 709 case CE_DISP_PERS: 710 fmd_hdl_debug(hdl, "adding %sPersistent event to CE serd " 711 "engine\n", type == CE_DISP_POSS_PERS ? "Possible-" : ""); 712 713 if (page->page_case.cc_serdnm == NULL) { 714 page->page_case.cc_serdnm = cmd_page_serdnm_create(hdl, 715 "page", page->page_physbase); 716 717 fmd_serd_create(hdl, page->page_case.cc_serdnm, 718 fmd_prop_get_int32(hdl, "ce_n"), 719 fmd_prop_get_int64(hdl, "ce_t")); 720 } 721 722 if (fmd_serd_record(hdl, page->page_case.cc_serdnm, ep) == 723 FMD_B_FALSE) 724 return (CMD_EVD_OK); /* engine hasn't fired */ 725 726 fmd_hdl_debug(hdl, "ce page serd fired\n"); 727 fmd_case_add_serd(hdl, page->page_case.cc_cp, 728 page->page_case.cc_serdnm); 729 fmd_serd_reset(hdl, page->page_case.cc_serdnm); 730 break; /* to retire */ 731 732 case CE_DISP_LEAKY: 733 case CE_DISP_STICKY: 734 fmd_case_add_ereport(hdl, page->page_case.cc_cp, ep); 735 break; /* to retire */ 736 } 737 738 if (page->page_flags & CMD_MEM_F_FAULTING || 739 fmd_nvl_fmri_unusable(hdl, page->page_asru_nvl)) 740 return (CMD_EVD_OK); 741 742 /* 743 * convert a unhashed address to hashed address 744 */ 745 cmd_to_hashed_addr(&addr, afar, class); 746 747 if (afar > dimm->dimm_phys_addr_hi) 748 dimm->dimm_phys_addr_hi = addr; 749 750 if (afar < dimm->dimm_phys_addr_low) 751 dimm->dimm_phys_addr_low = addr; 752 753 dimm->dimm_nretired++; 754 dimm->dimm_retstat.fmds_value.ui64++; 755 cmd_dimm_dirty(hdl, dimm); 756 757 cmd_page_fault(hdl, asru, cmd_dimm_fru(dimm), ep, afar); 758 ce_thresh_check(hdl, dimm); 759 760 return (CMD_EVD_OK); 761 } 762 763 /* 764 * Solve a bank case with suspect "fault.memory.bank". The caller must 765 * have populated bank->bank_case.cc_cp and is also responsible for adding 766 * associated ereport(s) to that case. 767 */ 768 void 769 cmd_bank_fault(fmd_hdl_t *hdl, cmd_bank_t *bank) 770 { 771 fmd_case_t *cp = bank->bank_case.cc_cp; 772 nvlist_t *flt; 773 774 if (bank->bank_flags & CMD_MEM_F_FAULTING) 775 return; /* Only complain once per bank */ 776 777 bank->bank_flags |= CMD_MEM_F_FAULTING; 778 cmd_bank_dirty(hdl, bank); 779 780 #ifdef sun4u 781 flt = cmd_bank_create_fault(hdl, bank, "fault.memory.bank", 782 CMD_FLTMAXCONF); 783 fmd_case_add_suspect(hdl, cp, flt); 784 #else /* sun4v */ 785 { 786 cmd_bank_memb_t *d; 787 788 /* create separate fault for each dimm in bank */ 789 790 for (d = cmd_list_next(&bank->bank_dimms); 791 d != NULL; d = cmd_list_next(d)) { 792 flt = cmd_dimm_create_fault(hdl, d->bm_dimm, 793 "fault.memory.bank", CMD_FLTMAXCONF); 794 fmd_case_add_suspect(hdl, cp, flt); 795 } 796 } 797 #endif /* sun4u */ 798 fmd_case_solve(hdl, cp); 799 } 800 801 /*ARGSUSED*/ 802 cmd_evdisp_t 803 cmd_ue_common(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, 804 const char *class, uint64_t afar, uint8_t afar_status, uint16_t synd, 805 uint8_t synd_status, ce_dispact_t type, uint64_t disp, nvlist_t *asru) 806 { 807 cmd_page_t *page; 808 cmd_bank_t *bank; 809 cmd_cpu_t *cpu; 810 811 #ifdef sun4u 812 /* 813 * Note: Currently all sun4u processors using this code share 814 * L2 and L3 cache at CMD_CPU_LEVEL_CORE. 815 */ 816 cpu = cmd_cpu_lookup_from_detector(hdl, nvl, class, 817 CMD_CPU_LEVEL_CORE); 818 #else /* sun4v */ 819 cpu = cmd_cpu_lookup_from_detector(hdl, nvl, class, 820 CMD_CPU_LEVEL_THREAD); 821 #endif /* sun4u */ 822 823 if (cpu == NULL) { 824 fmd_hdl_debug(hdl, "cmd_ue_common: cpu not found\n"); 825 return (CMD_EVD_UNUSED); 826 } 827 828 /* 829 * The following code applies only to sun4u, because sun4u does 830 * not poison data in L2 cache resulting from the fetch of a 831 * memory UE. 832 */ 833 834 #ifdef sun4u 835 if (afar_status != AFLT_STAT_VALID) { 836 /* 837 * Had this report's AFAR been valid, it would have 838 * contributed an address to the UE cache. We don't 839 * know what the AFAR would have been, and thus we can't 840 * add anything to the cache. If a xxU is caused by 841 * this UE, we won't be able to detect it, and will thus 842 * erroneously offline the CPU. To prevent this 843 * situation, we need to assume that all xxUs generated 844 * through the next E$ flush are attributable to the UE. 845 */ 846 cmd_cpu_uec_set_allmatch(hdl, cpu); 847 } else { 848 cmd_cpu_uec_add(hdl, cpu, afar); 849 } 850 #endif /* sun4u */ 851 852 if (synd_status != AFLT_STAT_VALID) { 853 fmd_hdl_debug(hdl, "cmd_ue_common: syndrome not valid\n"); 854 return (CMD_EVD_UNUSED); 855 } 856 857 if (cmd_mem_synd_check(hdl, afar, afar_status, synd, synd_status, 858 cpu) == CMD_EVD_UNUSED) 859 return (CMD_EVD_UNUSED); 860 861 if (afar_status != AFLT_STAT_VALID) 862 return (CMD_EVD_UNUSED); 863 864 if ((page = cmd_page_lookup(afar)) != NULL && 865 page->page_case.cc_cp != NULL && 866 fmd_case_solved(hdl, page->page_case.cc_cp)) 867 return (CMD_EVD_REDUND); 868 869 if (fmd_nvl_fmri_expand(hdl, asru) < 0) { 870 CMD_STAT_BUMP(bad_mem_asru); 871 return (NULL); 872 } 873 874 if ((bank = cmd_bank_lookup(hdl, asru)) == NULL && 875 (bank = cmd_bank_create(hdl, asru)) == NULL) 876 return (CMD_EVD_UNUSED); 877 878 #ifdef sun4v 879 { 880 nvlist_t *fmri; 881 char **snarray; 882 unsigned int i, n; 883 884 /* 885 * 1: locate the array of serial numbers inside the bank asru. 886 * 2: for each serial #, lookup its mem: FMRI in libtopo 887 * 3: ensure that each DIMM's FMRI is on bank's dimmlist 888 */ 889 890 if (nvlist_lookup_string_array(asru, 891 FM_FMRI_MEM_SERIAL_ID, &snarray, &n) != 0) 892 fmd_hdl_abort(hdl, "Cannot locate serial #s for bank"); 893 894 for (i = 0; i < n; i++) { 895 fmri = cmd_find_dimm_by_sn(hdl, FM_FMRI_SCHEME_MEM, 896 snarray[i]); 897 /* 898 * If dimm structure doesn't already exist for 899 * each dimm, create and link to bank. 900 */ 901 if (cmd_dimm_lookup(hdl, fmri) == NULL) 902 (void) cmd_dimm_create(hdl, fmri); 903 nvlist_free(fmri); 904 } 905 } 906 #endif /* sun4v */ 907 908 if (bank->bank_case.cc_cp == NULL) { 909 const char *uuid; 910 bank->bank_case.cc_cp = cmd_case_create(hdl, &bank->bank_header, 911 CMD_PTR_BANK_CASE, &uuid); 912 } 913 914 #ifdef sun4u 915 if (cmd_dp_error(hdl)) { 916 CMD_STAT_BUMP(dp_deferred_ue); 917 cmd_dp_page_defer(hdl, asru, ep, afar); 918 return (CMD_EVD_OK); 919 } else if (cmd_dp_fault(hdl, afar)) { 920 CMD_STAT_BUMP(dp_ignored_ue); 921 return (CMD_EVD_UNUSED); 922 } 923 #endif /* sun4u */ 924 925 fmd_case_add_ereport(hdl, bank->bank_case.cc_cp, ep); 926 927 bank->bank_nretired++; 928 bank->bank_retstat.fmds_value.ui64++; 929 cmd_bank_dirty(hdl, bank); 930 931 cmd_page_fault(hdl, bank->bank_asru_nvl, cmd_bank_fru(bank), ep, afar); 932 cmd_bank_fault(hdl, bank); 933 934 return (CMD_EVD_OK); 935 } 936 937 void 938 cmd_dimm_close(fmd_hdl_t *hdl, void *arg) 939 { 940 cmd_dimm_destroy(hdl, arg); 941 } 942 943 void 944 cmd_bank_close(fmd_hdl_t *hdl, void *arg) 945 { 946 cmd_bank_destroy(hdl, arg); 947 } 948