1 /* 2 * GHES/EDAC Linux driver 3 * 4 * This file may be distributed under the terms of the GNU General Public 5 * License version 2. 6 * 7 * Copyright (c) 2013 by Mauro Carvalho Chehab 8 * 9 * Red Hat Inc. http://www.redhat.com 10 */ 11 12 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 13 14 #include <acpi/ghes.h> 15 #include <linux/edac.h> 16 #include <linux/dmi.h> 17 #include "edac_module.h" 18 #include <ras/ras_event.h> 19 20 struct ghes_edac_pvt { 21 struct list_head list; 22 struct ghes *ghes; 23 struct mem_ctl_info *mci; 24 25 /* Buffers for the error handling routine */ 26 char detail_location[240]; 27 char other_detail[160]; 28 char msg[80]; 29 }; 30 31 static atomic_t ghes_init = ATOMIC_INIT(0); 32 static struct ghes_edac_pvt *ghes_pvt; 33 34 /* 35 * Sync with other, potentially concurrent callers of 36 * ghes_edac_report_mem_error(). We don't know what the 37 * "inventive" firmware would do. 38 */ 39 static DEFINE_SPINLOCK(ghes_lock); 40 41 /* "ghes_edac.force_load=1" skips the platform check */ 42 static bool __read_mostly force_load; 43 module_param(force_load, bool, 0); 44 45 /* Memory Device - Type 17 of SMBIOS spec */ 46 struct memdev_dmi_entry { 47 u8 type; 48 u8 length; 49 u16 handle; 50 u16 phys_mem_array_handle; 51 u16 mem_err_info_handle; 52 u16 total_width; 53 u16 data_width; 54 u16 size; 55 u8 form_factor; 56 u8 device_set; 57 u8 device_locator; 58 u8 bank_locator; 59 u8 memory_type; 60 u16 type_detail; 61 u16 speed; 62 u8 manufacturer; 63 u8 serial_number; 64 u8 asset_tag; 65 u8 part_number; 66 u8 attributes; 67 u32 extended_size; 68 u16 conf_mem_clk_speed; 69 } __attribute__((__packed__)); 70 71 struct ghes_edac_dimm_fill { 72 struct mem_ctl_info *mci; 73 unsigned count; 74 }; 75 76 static void ghes_edac_count_dimms(const struct dmi_header *dh, void *arg) 77 { 78 int *num_dimm = arg; 79 80 if (dh->type == DMI_ENTRY_MEM_DEVICE) 81 (*num_dimm)++; 82 } 83 84 static void ghes_edac_dmidecode(const struct dmi_header *dh, void *arg) 85 { 86 struct ghes_edac_dimm_fill *dimm_fill = arg; 87 struct mem_ctl_info *mci = dimm_fill->mci; 88 89 if (dh->type == DMI_ENTRY_MEM_DEVICE) { 90 struct memdev_dmi_entry *entry = (struct memdev_dmi_entry *)dh; 91 struct dimm_info *dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms, 92 mci->n_layers, 93 dimm_fill->count, 0, 0); 94 u16 rdr_mask = BIT(7) | BIT(13); 95 96 if (entry->size == 0xffff) { 97 pr_info("Can't get DIMM%i size\n", 98 dimm_fill->count); 99 dimm->nr_pages = MiB_TO_PAGES(32);/* Unknown */ 100 } else if (entry->size == 0x7fff) { 101 dimm->nr_pages = MiB_TO_PAGES(entry->extended_size); 102 } else { 103 if (entry->size & BIT(15)) 104 dimm->nr_pages = MiB_TO_PAGES((entry->size & 0x7fff) << 10); 105 else 106 dimm->nr_pages = MiB_TO_PAGES(entry->size); 107 } 108 109 switch (entry->memory_type) { 110 case 0x12: 111 if (entry->type_detail & BIT(13)) 112 dimm->mtype = MEM_RDDR; 113 else 114 dimm->mtype = MEM_DDR; 115 break; 116 case 0x13: 117 if (entry->type_detail & BIT(13)) 118 dimm->mtype = MEM_RDDR2; 119 else 120 dimm->mtype = MEM_DDR2; 121 break; 122 case 0x14: 123 dimm->mtype = MEM_FB_DDR2; 124 break; 125 case 0x18: 126 if (entry->type_detail & BIT(12)) 127 dimm->mtype = MEM_NVDIMM; 128 else if (entry->type_detail & BIT(13)) 129 dimm->mtype = MEM_RDDR3; 130 else 131 dimm->mtype = MEM_DDR3; 132 break; 133 case 0x1a: 134 if (entry->type_detail & BIT(12)) 135 dimm->mtype = MEM_NVDIMM; 136 else if (entry->type_detail & BIT(13)) 137 dimm->mtype = MEM_RDDR4; 138 else 139 dimm->mtype = MEM_DDR4; 140 break; 141 default: 142 if (entry->type_detail & BIT(6)) 143 dimm->mtype = MEM_RMBS; 144 else if ((entry->type_detail & rdr_mask) == rdr_mask) 145 dimm->mtype = MEM_RDR; 146 else if (entry->type_detail & BIT(7)) 147 dimm->mtype = MEM_SDR; 148 else if (entry->type_detail & BIT(9)) 149 dimm->mtype = MEM_EDO; 150 else 151 dimm->mtype = MEM_UNKNOWN; 152 } 153 154 /* 155 * Actually, we can only detect if the memory has bits for 156 * checksum or not 157 */ 158 if (entry->total_width == entry->data_width) 159 dimm->edac_mode = EDAC_NONE; 160 else 161 dimm->edac_mode = EDAC_SECDED; 162 163 dimm->dtype = DEV_UNKNOWN; 164 dimm->grain = 128; /* Likely, worse case */ 165 166 /* 167 * FIXME: It shouldn't be hard to also fill the DIMM labels 168 */ 169 170 if (dimm->nr_pages) { 171 edac_dbg(1, "DIMM%i: %s size = %d MB%s\n", 172 dimm_fill->count, edac_mem_types[dimm->mtype], 173 PAGES_TO_MiB(dimm->nr_pages), 174 (dimm->edac_mode != EDAC_NONE) ? "(ECC)" : ""); 175 edac_dbg(2, "\ttype %d, detail 0x%02x, width %d(total %d)\n", 176 entry->memory_type, entry->type_detail, 177 entry->total_width, entry->data_width); 178 } 179 180 dimm_fill->count++; 181 } 182 } 183 184 void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err) 185 { 186 enum hw_event_mc_err_type type; 187 struct edac_raw_error_desc *e; 188 struct mem_ctl_info *mci; 189 struct ghes_edac_pvt *pvt = ghes_pvt; 190 unsigned long flags; 191 char *p; 192 u8 grain_bits; 193 194 if (!pvt) 195 return; 196 197 /* 198 * We can do the locking below because GHES defers error processing 199 * from NMI to IRQ context. Whenever that changes, we'd at least 200 * know. 201 */ 202 if (WARN_ON_ONCE(in_nmi())) 203 return; 204 205 spin_lock_irqsave(&ghes_lock, flags); 206 207 mci = pvt->mci; 208 e = &mci->error_desc; 209 210 /* Cleans the error report buffer */ 211 memset(e, 0, sizeof (*e)); 212 e->error_count = 1; 213 strcpy(e->label, "unknown label"); 214 e->msg = pvt->msg; 215 e->other_detail = pvt->other_detail; 216 e->top_layer = -1; 217 e->mid_layer = -1; 218 e->low_layer = -1; 219 *pvt->other_detail = '\0'; 220 *pvt->msg = '\0'; 221 222 switch (sev) { 223 case GHES_SEV_CORRECTED: 224 type = HW_EVENT_ERR_CORRECTED; 225 break; 226 case GHES_SEV_RECOVERABLE: 227 type = HW_EVENT_ERR_UNCORRECTED; 228 break; 229 case GHES_SEV_PANIC: 230 type = HW_EVENT_ERR_FATAL; 231 break; 232 default: 233 case GHES_SEV_NO: 234 type = HW_EVENT_ERR_INFO; 235 } 236 237 edac_dbg(1, "error validation_bits: 0x%08llx\n", 238 (long long)mem_err->validation_bits); 239 240 /* Error type, mapped on e->msg */ 241 if (mem_err->validation_bits & CPER_MEM_VALID_ERROR_TYPE) { 242 p = pvt->msg; 243 switch (mem_err->error_type) { 244 case 0: 245 p += sprintf(p, "Unknown"); 246 break; 247 case 1: 248 p += sprintf(p, "No error"); 249 break; 250 case 2: 251 p += sprintf(p, "Single-bit ECC"); 252 break; 253 case 3: 254 p += sprintf(p, "Multi-bit ECC"); 255 break; 256 case 4: 257 p += sprintf(p, "Single-symbol ChipKill ECC"); 258 break; 259 case 5: 260 p += sprintf(p, "Multi-symbol ChipKill ECC"); 261 break; 262 case 6: 263 p += sprintf(p, "Master abort"); 264 break; 265 case 7: 266 p += sprintf(p, "Target abort"); 267 break; 268 case 8: 269 p += sprintf(p, "Parity Error"); 270 break; 271 case 9: 272 p += sprintf(p, "Watchdog timeout"); 273 break; 274 case 10: 275 p += sprintf(p, "Invalid address"); 276 break; 277 case 11: 278 p += sprintf(p, "Mirror Broken"); 279 break; 280 case 12: 281 p += sprintf(p, "Memory Sparing"); 282 break; 283 case 13: 284 p += sprintf(p, "Scrub corrected error"); 285 break; 286 case 14: 287 p += sprintf(p, "Scrub uncorrected error"); 288 break; 289 case 15: 290 p += sprintf(p, "Physical Memory Map-out event"); 291 break; 292 default: 293 p += sprintf(p, "reserved error (%d)", 294 mem_err->error_type); 295 } 296 } else { 297 strcpy(pvt->msg, "unknown error"); 298 } 299 300 /* Error address */ 301 if (mem_err->validation_bits & CPER_MEM_VALID_PA) { 302 e->page_frame_number = mem_err->physical_addr >> PAGE_SHIFT; 303 e->offset_in_page = mem_err->physical_addr & ~PAGE_MASK; 304 } 305 306 /* Error grain */ 307 if (mem_err->validation_bits & CPER_MEM_VALID_PA_MASK) 308 e->grain = ~(mem_err->physical_addr_mask & ~PAGE_MASK); 309 310 /* Memory error location, mapped on e->location */ 311 p = e->location; 312 if (mem_err->validation_bits & CPER_MEM_VALID_NODE) 313 p += sprintf(p, "node:%d ", mem_err->node); 314 if (mem_err->validation_bits & CPER_MEM_VALID_CARD) 315 p += sprintf(p, "card:%d ", mem_err->card); 316 if (mem_err->validation_bits & CPER_MEM_VALID_MODULE) 317 p += sprintf(p, "module:%d ", mem_err->module); 318 if (mem_err->validation_bits & CPER_MEM_VALID_RANK_NUMBER) 319 p += sprintf(p, "rank:%d ", mem_err->rank); 320 if (mem_err->validation_bits & CPER_MEM_VALID_BANK) 321 p += sprintf(p, "bank:%d ", mem_err->bank); 322 if (mem_err->validation_bits & CPER_MEM_VALID_ROW) 323 p += sprintf(p, "row:%d ", mem_err->row); 324 if (mem_err->validation_bits & CPER_MEM_VALID_COLUMN) 325 p += sprintf(p, "col:%d ", mem_err->column); 326 if (mem_err->validation_bits & CPER_MEM_VALID_BIT_POSITION) 327 p += sprintf(p, "bit_pos:%d ", mem_err->bit_pos); 328 if (mem_err->validation_bits & CPER_MEM_VALID_MODULE_HANDLE) { 329 const char *bank = NULL, *device = NULL; 330 dmi_memdev_name(mem_err->mem_dev_handle, &bank, &device); 331 if (bank != NULL && device != NULL) 332 p += sprintf(p, "DIMM location:%s %s ", bank, device); 333 else 334 p += sprintf(p, "DIMM DMI handle: 0x%.4x ", 335 mem_err->mem_dev_handle); 336 } 337 if (p > e->location) 338 *(p - 1) = '\0'; 339 340 /* All other fields are mapped on e->other_detail */ 341 p = pvt->other_detail; 342 if (mem_err->validation_bits & CPER_MEM_VALID_ERROR_STATUS) { 343 u64 status = mem_err->error_status; 344 345 p += sprintf(p, "status(0x%016llx): ", (long long)status); 346 switch ((status >> 8) & 0xff) { 347 case 1: 348 p += sprintf(p, "Error detected internal to the component "); 349 break; 350 case 16: 351 p += sprintf(p, "Error detected in the bus "); 352 break; 353 case 4: 354 p += sprintf(p, "Storage error in DRAM memory "); 355 break; 356 case 5: 357 p += sprintf(p, "Storage error in TLB "); 358 break; 359 case 6: 360 p += sprintf(p, "Storage error in cache "); 361 break; 362 case 7: 363 p += sprintf(p, "Error in one or more functional units "); 364 break; 365 case 8: 366 p += sprintf(p, "component failed self test "); 367 break; 368 case 9: 369 p += sprintf(p, "Overflow or undervalue of internal queue "); 370 break; 371 case 17: 372 p += sprintf(p, "Virtual address not found on IO-TLB or IO-PDIR "); 373 break; 374 case 18: 375 p += sprintf(p, "Improper access error "); 376 break; 377 case 19: 378 p += sprintf(p, "Access to a memory address which is not mapped to any component "); 379 break; 380 case 20: 381 p += sprintf(p, "Loss of Lockstep "); 382 break; 383 case 21: 384 p += sprintf(p, "Response not associated with a request "); 385 break; 386 case 22: 387 p += sprintf(p, "Bus parity error - must also set the A, C, or D Bits "); 388 break; 389 case 23: 390 p += sprintf(p, "Detection of a PATH_ERROR "); 391 break; 392 case 25: 393 p += sprintf(p, "Bus operation timeout "); 394 break; 395 case 26: 396 p += sprintf(p, "A read was issued to data that has been poisoned "); 397 break; 398 default: 399 p += sprintf(p, "reserved "); 400 break; 401 } 402 } 403 if (mem_err->validation_bits & CPER_MEM_VALID_REQUESTOR_ID) 404 p += sprintf(p, "requestorID: 0x%016llx ", 405 (long long)mem_err->requestor_id); 406 if (mem_err->validation_bits & CPER_MEM_VALID_RESPONDER_ID) 407 p += sprintf(p, "responderID: 0x%016llx ", 408 (long long)mem_err->responder_id); 409 if (mem_err->validation_bits & CPER_MEM_VALID_TARGET_ID) 410 p += sprintf(p, "targetID: 0x%016llx ", 411 (long long)mem_err->responder_id); 412 if (p > pvt->other_detail) 413 *(p - 1) = '\0'; 414 415 /* Generate the trace event */ 416 grain_bits = fls_long(e->grain); 417 snprintf(pvt->detail_location, sizeof(pvt->detail_location), 418 "APEI location: %s %s", e->location, e->other_detail); 419 trace_mc_event(type, e->msg, e->label, e->error_count, 420 mci->mc_idx, e->top_layer, e->mid_layer, e->low_layer, 421 (e->page_frame_number << PAGE_SHIFT) | e->offset_in_page, 422 grain_bits, e->syndrome, pvt->detail_location); 423 424 edac_raw_mc_handle_error(type, mci, e); 425 spin_unlock_irqrestore(&ghes_lock, flags); 426 } 427 428 /* 429 * Known systems that are safe to enable this module. 430 */ 431 static struct acpi_platform_list plat_list[] = { 432 {"HPE ", "Server ", 0, ACPI_SIG_FADT, all_versions}, 433 { } /* End */ 434 }; 435 436 int ghes_edac_register(struct ghes *ghes, struct device *dev) 437 { 438 bool fake = false; 439 int rc, num_dimm = 0; 440 struct mem_ctl_info *mci; 441 struct edac_mc_layer layers[1]; 442 struct ghes_edac_dimm_fill dimm_fill; 443 int idx = -1; 444 445 if (IS_ENABLED(CONFIG_X86)) { 446 /* Check if safe to enable on this system */ 447 idx = acpi_match_platform_list(plat_list); 448 if (!force_load && idx < 0) 449 return -ENODEV; 450 } else { 451 idx = 0; 452 } 453 454 /* 455 * We have only one logical memory controller to which all DIMMs belong. 456 */ 457 if (atomic_inc_return(&ghes_init) > 1) 458 return 0; 459 460 /* Get the number of DIMMs */ 461 dmi_walk(ghes_edac_count_dimms, &num_dimm); 462 463 /* Check if we've got a bogus BIOS */ 464 if (num_dimm == 0) { 465 fake = true; 466 num_dimm = 1; 467 } 468 469 layers[0].type = EDAC_MC_LAYER_ALL_MEM; 470 layers[0].size = num_dimm; 471 layers[0].is_virt_csrow = true; 472 473 mci = edac_mc_alloc(0, ARRAY_SIZE(layers), layers, sizeof(struct ghes_edac_pvt)); 474 if (!mci) { 475 pr_info("Can't allocate memory for EDAC data\n"); 476 return -ENOMEM; 477 } 478 479 ghes_pvt = mci->pvt_info; 480 ghes_pvt->ghes = ghes; 481 ghes_pvt->mci = mci; 482 483 mci->pdev = dev; 484 mci->mtype_cap = MEM_FLAG_EMPTY; 485 mci->edac_ctl_cap = EDAC_FLAG_NONE; 486 mci->edac_cap = EDAC_FLAG_NONE; 487 mci->mod_name = "ghes_edac.c"; 488 mci->ctl_name = "ghes_edac"; 489 mci->dev_name = "ghes"; 490 491 if (fake) { 492 pr_info("This system has a very crappy BIOS: It doesn't even list the DIMMS.\n"); 493 pr_info("Its SMBIOS info is wrong. It is doubtful that the error report would\n"); 494 pr_info("work on such system. Use this driver with caution\n"); 495 } else if (idx < 0) { 496 pr_info("This EDAC driver relies on BIOS to enumerate memory and get error reports.\n"); 497 pr_info("Unfortunately, not all BIOSes reflect the memory layout correctly.\n"); 498 pr_info("So, the end result of using this driver varies from vendor to vendor.\n"); 499 pr_info("If you find incorrect reports, please contact your hardware vendor\n"); 500 pr_info("to correct its BIOS.\n"); 501 pr_info("This system has %d DIMM sockets.\n", num_dimm); 502 } 503 504 if (!fake) { 505 dimm_fill.count = 0; 506 dimm_fill.mci = mci; 507 dmi_walk(ghes_edac_dmidecode, &dimm_fill); 508 } else { 509 struct dimm_info *dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms, 510 mci->n_layers, 0, 0, 0); 511 512 dimm->nr_pages = 1; 513 dimm->grain = 128; 514 dimm->mtype = MEM_UNKNOWN; 515 dimm->dtype = DEV_UNKNOWN; 516 dimm->edac_mode = EDAC_SECDED; 517 } 518 519 rc = edac_mc_add_mc(mci); 520 if (rc < 0) { 521 pr_info("Can't register at EDAC core\n"); 522 edac_mc_free(mci); 523 return -ENODEV; 524 } 525 return 0; 526 } 527 528 void ghes_edac_unregister(struct ghes *ghes) 529 { 530 struct mem_ctl_info *mci; 531 532 if (!ghes_pvt) 533 return; 534 535 mci = ghes_pvt->mci; 536 edac_mc_del_mc(mci->pdev); 537 edac_mc_free(mci); 538 } 539