1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * FRU (Field-Replaceable Unit) Memory Poison Manager 4 * 5 * Copyright (c) 2024, Advanced Micro Devices, Inc. 6 * All Rights Reserved. 7 * 8 * Authors: 9 * Naveen Krishna Chatradhi <naveenkrishna.chatradhi@amd.com> 10 * Muralidhara M K <muralidhara.mk@amd.com> 11 * Yazen Ghannam <Yazen.Ghannam@amd.com> 12 * 13 * Implementation notes, assumptions, and limitations: 14 * 15 * - FRU memory poison section and memory poison descriptor definitions are not yet 16 * included in the UEFI specification. So they are defined here. Afterwards, they 17 * may be moved to linux/cper.h, if appropriate. 18 * 19 * - Platforms based on AMD MI300 systems will be the first to use these structures. 20 * There are a number of assumptions made here that will need to be generalized 21 * to support other platforms. 22 * 23 * AMD MI300-based platform(s) assumptions: 24 * - Memory errors are reported through x86 MCA. 25 * - The entire DRAM row containing a memory error should be retired. 26 * - There will be (1) FRU memory poison section per CPER. 27 * - The FRU will be the CPU package (processor socket). 28 * - The default number of memory poison descriptor entries should be (8). 29 * - The platform will use ACPI ERST for persistent storage. 30 * - All FRU records should be saved to persistent storage. Module init will 31 * fail if any FRU record is not successfully written. 32 * 33 * - Boot time memory retirement may occur later than ideal due to dependencies 34 * on other libraries and drivers. This leaves a gap where bad memory may be 35 * accessed during early boot stages. 36 * 37 * - Enough memory should be pre-allocated for each FRU record to be able to hold 38 * the expected number of descriptor entries. This, mostly empty, record is 39 * written to storage during init time. Subsequent writes to the same record 40 * should allow the Platform to update the stored record in-place. 
 * Otherwise,
 * if the record is extended, then the Platform may need to perform costly memory
 * management operations on the storage. For example, the Platform may spend time
 * in Firmware copying and invalidating memory on a relatively slow SPI ROM.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/cper.h>
#include <linux/ras.h>
#include <linux/cpu.h>

#include <acpi/apei.h>

#include <asm/cpu_device_id.h>
#include <asm/mce.h>

#include "../debugfs.h"

/* Sentinel "no CPU found" value used when scanning online CPUs. */
#define INVALID_CPU UINT_MAX

/* Validation Bits */
#define FMP_VALID_ARCH_TYPE BIT_ULL(0)
#define FMP_VALID_ARCH BIT_ULL(1)
#define FMP_VALID_ID_TYPE BIT_ULL(2)
#define FMP_VALID_ID BIT_ULL(3)
#define FMP_VALID_LIST_ENTRIES BIT_ULL(4)
#define FMP_VALID_LIST BIT_ULL(5)

/* FRU Architecture Types */
#define FMP_ARCH_TYPE_X86_CPUID_1_EAX 0

/* FRU ID Types */
#define FMP_ID_TYPE_X86_PPIN 0

/*
 * FRU Memory Poison Section
 *
 * Not yet part of the UEFI CPER spec; see the header comment above.
 * The section is validated by a checksum over all of its bytes, so the
 * layout must stay packed and stable across kernel versions.
 */
struct cper_sec_fru_mem_poison {
	u32 checksum;		/* complement sum; whole section sums to zero */
	u64 validation_bits;	/* FMP_VALID_* flags */
	u32 fru_arch_type;	/* FMP_ARCH_TYPE_* */
	u64 fru_arch;		/* e.g. CPUID leaf 1 EAX value */
	u32 fru_id_type;	/* FMP_ID_TYPE_* */
	u64 fru_id;		/* e.g. x86 PPIN of the package */
	u32 nr_entries;		/* number of valid descriptor entries */
} __packed;

/* FRU Descriptor ID Types */
#define FPD_HW_ID_TYPE_MCA_IPID 0

/* FRU Descriptor Address Types */
#define FPD_ADDR_TYPE_MCA_ADDR 0

/* Memory Poison Descriptor: one logged memory error within a FRU. */
struct cper_fru_poison_desc {
	u64 timestamp;		/* MCE timestamp; ignored when de-duplicating */
	u32 hw_id_type;		/* FPD_HW_ID_TYPE_* */
	u64 hw_id;		/* e.g. MCA_IPID */
	u32 addr_type;		/* FPD_ADDR_TYPE_* */
	u64 addr;		/* e.g. MCA_ADDR */
} __packed;

/* Collection of headers and sections for easy pointer use. */
struct fru_rec {
	struct cper_record_header hdr;
	struct cper_section_descriptor sec_desc;
	struct cper_sec_fru_mem_poison fmp;
	struct cper_fru_poison_desc entries[];
} __packed;

/*
 * Pointers to the complete CPER record of each FRU.
 *
 * Memory allocation will include padded space for descriptor entries.
 */
static struct fru_rec **fru_records;

/* system physical addresses array */
static u64 *spa_entries;

#define INVALID_SPA ~0ULL

static struct dentry *fmpm_dfs_dir;
static struct dentry *fmpm_dfs_entries;

#define CPER_CREATOR_FMP						\
	GUID_INIT(0xcd5c2993, 0xf4b2, 0x41b2, 0xb5, 0xd4, 0xf9, 0xc3,	\
		  0xa0, 0x33, 0x08, 0x75)

#define CPER_SECTION_TYPE_FMP						\
	GUID_INIT(0x5e4706c1, 0x5356, 0x48c6, 0x93, 0x0b, 0x52, 0xf2,	\
		  0x12, 0x0a, 0x44, 0x58)

/**
 * DOC: max_nr_entries (byte)
 * Maximum number of descriptor entries possible for each FRU.
 *
 * Values between '1' and '255' are valid.
 * No input or '0' will default to FMPM_DEFAULT_MAX_NR_ENTRIES.
 */
static u8 max_nr_entries;
module_param(max_nr_entries, byte, 0644);
MODULE_PARM_DESC(max_nr_entries,
		 "Maximum number of memory poison descriptor entries per FRU");

#define FMPM_DEFAULT_MAX_NR_ENTRIES 8

/* Maximum number of FRUs in the system. */
#define FMPM_MAX_NR_FRU 256
static unsigned int max_nr_fru;

/* Total length of record including headers and list of descriptor entries. */
static size_t max_rec_len;

/* Total number of SPA entries across all FRUs. */
static unsigned int spa_nr_entries;

/*
 * Protect the local records cache in fru_records and prevent concurrent
 * writes to storage. This is only needed after init once notifier block
 * registration is done.
 *
 * The majority of a record is fixed at module init and will not change
 * during run time. The entries within a record will be updated as new
 * errors are reported. The mutex should be held whenever the entries are
 * accessed during run time.
165 */ 166 static DEFINE_MUTEX(fmpm_update_mutex); 167 168 #define for_each_fru(i, rec) \ 169 for (i = 0; rec = fru_records[i], i < max_nr_fru; i++) 170 171 static inline u32 get_fmp_len(struct fru_rec *rec) 172 { 173 return rec->sec_desc.section_length - sizeof(struct cper_section_descriptor); 174 } 175 176 static struct fru_rec *get_fru_record(u64 fru_id) 177 { 178 struct fru_rec *rec; 179 unsigned int i; 180 181 for_each_fru(i, rec) { 182 if (rec->fmp.fru_id == fru_id) 183 return rec; 184 } 185 186 pr_debug("Record not found for FRU 0x%016llx\n", fru_id); 187 188 return NULL; 189 } 190 191 /* 192 * Sum up all bytes within the FRU Memory Poison Section including the Memory 193 * Poison Descriptor entries. 194 * 195 * Don't include the old checksum here. It's a u32 value, so summing each of its 196 * bytes will give the wrong total. 197 */ 198 static u32 do_fmp_checksum(struct cper_sec_fru_mem_poison *fmp, u32 len) 199 { 200 u32 checksum = 0; 201 u8 *buf, *end; 202 203 /* Skip old checksum. */ 204 buf = (u8 *)fmp + sizeof(u32); 205 end = buf + len; 206 207 while (buf < end) 208 checksum += (u8)(*(buf++)); 209 210 return checksum; 211 } 212 213 static int update_record_on_storage(struct fru_rec *rec) 214 { 215 u32 len, checksum; 216 int ret; 217 218 /* Calculate a new checksum. */ 219 len = get_fmp_len(rec); 220 221 /* Get the current total. */ 222 checksum = do_fmp_checksum(&rec->fmp, len); 223 224 /* Use the complement value. 
*/ 225 rec->fmp.checksum = -checksum; 226 227 pr_debug("Writing to storage\n"); 228 229 ret = erst_write(&rec->hdr); 230 if (ret) { 231 pr_warn("Storage update failed for FRU 0x%016llx\n", rec->fmp.fru_id); 232 233 if (ret == -ENOSPC) 234 pr_warn("Not enough space on storage\n"); 235 } 236 237 return ret; 238 } 239 240 static bool rec_has_valid_entries(struct fru_rec *rec) 241 { 242 if (!(rec->fmp.validation_bits & FMP_VALID_LIST_ENTRIES)) 243 return false; 244 245 if (!(rec->fmp.validation_bits & FMP_VALID_LIST)) 246 return false; 247 248 return true; 249 } 250 251 static bool fpds_equal(struct cper_fru_poison_desc *old, struct cper_fru_poison_desc *new) 252 { 253 /* 254 * Ignore timestamp field. 255 * The same physical error may be reported multiple times due to stuck bits, etc. 256 * 257 * Also, order the checks from most->least likely to fail to shortcut the code. 258 */ 259 if (old->addr != new->addr) 260 return false; 261 262 if (old->hw_id != new->hw_id) 263 return false; 264 265 if (old->addr_type != new->addr_type) 266 return false; 267 268 if (old->hw_id_type != new->hw_id_type) 269 return false; 270 271 return true; 272 } 273 274 static bool rec_has_fpd(struct fru_rec *rec, struct cper_fru_poison_desc *fpd) 275 { 276 unsigned int i; 277 278 for (i = 0; i < rec->fmp.nr_entries; i++) { 279 struct cper_fru_poison_desc *fpd_i = &rec->entries[i]; 280 281 if (fpds_equal(fpd_i, fpd)) { 282 pr_debug("Found duplicate record\n"); 283 return true; 284 } 285 } 286 287 return false; 288 } 289 290 static void save_spa(struct fru_rec *rec, unsigned int entry, 291 u64 addr, u64 id, unsigned int cpu) 292 { 293 unsigned int i, fru_idx, spa_entry; 294 struct atl_err a_err; 295 unsigned long spa; 296 297 if (entry >= max_nr_entries) { 298 pr_warn_once("FRU descriptor entry %d out-of-bounds (max: %d)\n", 299 entry, max_nr_entries); 300 return; 301 } 302 303 /* spa_nr_entries is always multiple of max_nr_entries */ 304 for (i = 0; i < spa_nr_entries; i += max_nr_entries) { 
305 fru_idx = i / max_nr_entries; 306 if (fru_records[fru_idx] == rec) 307 break; 308 } 309 310 if (i >= spa_nr_entries) { 311 pr_warn_once("FRU record %d not found\n", i); 312 return; 313 } 314 315 spa_entry = i + entry; 316 if (spa_entry >= spa_nr_entries) { 317 pr_warn_once("spa_entries[] index out-of-bounds\n"); 318 return; 319 } 320 321 memset(&a_err, 0, sizeof(struct atl_err)); 322 323 a_err.addr = addr; 324 a_err.ipid = id; 325 a_err.cpu = cpu; 326 327 spa = amd_convert_umc_mca_addr_to_sys_addr(&a_err); 328 if (IS_ERR_VALUE(spa)) { 329 pr_debug("Failed to get system address\n"); 330 return; 331 } 332 333 spa_entries[spa_entry] = spa; 334 pr_debug("fru_idx: %u, entry: %u, spa_entry: %u, spa: 0x%016llx\n", 335 fru_idx, entry, spa_entry, spa_entries[spa_entry]); 336 } 337 338 static void update_fru_record(struct fru_rec *rec, struct mce *m) 339 { 340 struct cper_sec_fru_mem_poison *fmp = &rec->fmp; 341 struct cper_fru_poison_desc fpd, *fpd_dest; 342 u32 entry = 0; 343 344 mutex_lock(&fmpm_update_mutex); 345 346 memset(&fpd, 0, sizeof(struct cper_fru_poison_desc)); 347 348 fpd.timestamp = m->time; 349 fpd.hw_id_type = FPD_HW_ID_TYPE_MCA_IPID; 350 fpd.hw_id = m->ipid; 351 fpd.addr_type = FPD_ADDR_TYPE_MCA_ADDR; 352 fpd.addr = m->addr; 353 354 /* This is the first entry, so just save it. */ 355 if (!rec_has_valid_entries(rec)) 356 goto save_fpd; 357 358 /* Ignore already recorded errors. 
*/ 359 if (rec_has_fpd(rec, &fpd)) 360 goto out_unlock; 361 362 if (rec->fmp.nr_entries >= max_nr_entries) { 363 pr_warn("Exceeded number of entries for FRU 0x%016llx\n", rec->fmp.fru_id); 364 goto out_unlock; 365 } 366 367 entry = fmp->nr_entries; 368 369 save_fpd: 370 save_spa(rec, entry, m->addr, m->ipid, m->extcpu); 371 fpd_dest = &rec->entries[entry]; 372 memcpy(fpd_dest, &fpd, sizeof(struct cper_fru_poison_desc)); 373 374 fmp->nr_entries = entry + 1; 375 fmp->validation_bits |= FMP_VALID_LIST_ENTRIES; 376 fmp->validation_bits |= FMP_VALID_LIST; 377 378 pr_debug("Updated FRU 0x%016llx entry #%u\n", fmp->fru_id, entry); 379 380 update_record_on_storage(rec); 381 382 out_unlock: 383 mutex_unlock(&fmpm_update_mutex); 384 } 385 386 static void retire_dram_row(u64 addr, u64 id, u32 cpu) 387 { 388 struct atl_err a_err; 389 390 memset(&a_err, 0, sizeof(struct atl_err)); 391 392 a_err.addr = addr; 393 a_err.ipid = id; 394 a_err.cpu = cpu; 395 396 amd_retire_dram_row(&a_err); 397 } 398 399 static int fru_handle_mem_poison(struct notifier_block *nb, unsigned long val, void *data) 400 { 401 struct mce *m = (struct mce *)data; 402 struct fru_rec *rec; 403 404 if (!mce_is_memory_error(m)) 405 return NOTIFY_DONE; 406 407 retire_dram_row(m->addr, m->ipid, m->extcpu); 408 409 /* 410 * An invalid FRU ID should not happen on real errors. But it 411 * could happen from software error injection, etc. 
412 */ 413 rec = get_fru_record(m->ppin); 414 if (!rec) 415 return NOTIFY_DONE; 416 417 update_fru_record(rec, m); 418 419 return NOTIFY_OK; 420 } 421 422 static struct notifier_block fru_mem_poison_nb = { 423 .notifier_call = fru_handle_mem_poison, 424 .priority = MCE_PRIO_LOWEST, 425 }; 426 427 static void retire_mem_fmp(struct fru_rec *rec) 428 { 429 struct cper_sec_fru_mem_poison *fmp = &rec->fmp; 430 unsigned int i, cpu; 431 432 for (i = 0; i < fmp->nr_entries; i++) { 433 struct cper_fru_poison_desc *fpd = &rec->entries[i]; 434 unsigned int err_cpu = INVALID_CPU; 435 436 if (fpd->hw_id_type != FPD_HW_ID_TYPE_MCA_IPID) 437 continue; 438 439 if (fpd->addr_type != FPD_ADDR_TYPE_MCA_ADDR) 440 continue; 441 442 cpus_read_lock(); 443 for_each_online_cpu(cpu) { 444 if (topology_ppin(cpu) == fmp->fru_id) { 445 err_cpu = cpu; 446 break; 447 } 448 } 449 cpus_read_unlock(); 450 451 if (err_cpu == INVALID_CPU) 452 continue; 453 454 retire_dram_row(fpd->addr, fpd->hw_id, err_cpu); 455 save_spa(rec, i, fpd->addr, fpd->hw_id, err_cpu); 456 } 457 } 458 459 static void retire_mem_records(void) 460 { 461 struct fru_rec *rec; 462 unsigned int i; 463 464 for_each_fru(i, rec) { 465 if (!rec_has_valid_entries(rec)) 466 continue; 467 468 retire_mem_fmp(rec); 469 } 470 } 471 472 /* Set the CPER Record Header and CPER Section Descriptor fields. */ 473 static void set_rec_fields(struct fru_rec *rec) 474 { 475 struct cper_section_descriptor *sec_desc = &rec->sec_desc; 476 struct cper_record_header *hdr = &rec->hdr; 477 478 memcpy(hdr->signature, CPER_SIG_RECORD, CPER_SIG_SIZE); 479 hdr->revision = CPER_RECORD_REV; 480 hdr->signature_end = CPER_SIG_END; 481 482 /* 483 * Currently, it is assumed that there is one FRU Memory Poison 484 * section per CPER. But this may change for other implementations. 485 */ 486 hdr->section_count = 1; 487 488 /* The logged errors are recoverable. Otherwise, they'd never make it here. 
*/ 489 hdr->error_severity = CPER_SEV_RECOVERABLE; 490 491 hdr->validation_bits = 0; 492 hdr->record_length = max_rec_len; 493 hdr->creator_id = CPER_CREATOR_FMP; 494 hdr->notification_type = CPER_NOTIFY_MCE; 495 hdr->record_id = cper_next_record_id(); 496 hdr->flags = CPER_HW_ERROR_FLAGS_PREVERR; 497 498 sec_desc->section_offset = sizeof(struct cper_record_header); 499 sec_desc->section_length = max_rec_len - sizeof(struct cper_record_header); 500 sec_desc->revision = CPER_SEC_REV; 501 sec_desc->validation_bits = 0; 502 sec_desc->flags = CPER_SEC_PRIMARY; 503 sec_desc->section_type = CPER_SECTION_TYPE_FMP; 504 sec_desc->section_severity = CPER_SEV_RECOVERABLE; 505 } 506 507 static int save_new_records(void) 508 { 509 DECLARE_BITMAP(new_records, FMPM_MAX_NR_FRU); 510 struct fru_rec *rec; 511 unsigned int i; 512 int ret = 0; 513 514 for_each_fru(i, rec) { 515 if (rec->hdr.record_length) 516 continue; 517 518 set_rec_fields(rec); 519 520 ret = update_record_on_storage(rec); 521 if (ret) 522 goto out_clear; 523 524 set_bit(i, new_records); 525 } 526 527 return ret; 528 529 out_clear: 530 for_each_fru(i, rec) { 531 if (!test_bit(i, new_records)) 532 continue; 533 534 erst_clear(rec->hdr.record_id); 535 } 536 537 return ret; 538 } 539 540 /* Check that the record matches expected types for the current system.*/ 541 static bool fmp_is_usable(struct fru_rec *rec) 542 { 543 struct cper_sec_fru_mem_poison *fmp = &rec->fmp; 544 u64 cpuid; 545 546 pr_debug("Validation bits: 0x%016llx\n", fmp->validation_bits); 547 548 if (!(fmp->validation_bits & FMP_VALID_ARCH_TYPE)) { 549 pr_debug("Arch type unknown\n"); 550 return false; 551 } 552 553 if (fmp->fru_arch_type != FMP_ARCH_TYPE_X86_CPUID_1_EAX) { 554 pr_debug("Arch type not 'x86 Family/Model/Stepping'\n"); 555 return false; 556 } 557 558 if (!(fmp->validation_bits & FMP_VALID_ARCH)) { 559 pr_debug("Arch value unknown\n"); 560 return false; 561 } 562 563 cpuid = cpuid_eax(1); 564 if (fmp->fru_arch != cpuid) { 565 pr_debug("Arch 
value mismatch: record = 0x%016llx, system = 0x%016llx\n", 566 fmp->fru_arch, cpuid); 567 return false; 568 } 569 570 if (!(fmp->validation_bits & FMP_VALID_ID_TYPE)) { 571 pr_debug("FRU ID type unknown\n"); 572 return false; 573 } 574 575 if (fmp->fru_id_type != FMP_ID_TYPE_X86_PPIN) { 576 pr_debug("FRU ID type is not 'x86 PPIN'\n"); 577 return false; 578 } 579 580 if (!(fmp->validation_bits & FMP_VALID_ID)) { 581 pr_debug("FRU ID value unknown\n"); 582 return false; 583 } 584 585 return true; 586 } 587 588 static bool fmp_is_valid(struct fru_rec *rec) 589 { 590 struct cper_sec_fru_mem_poison *fmp = &rec->fmp; 591 u32 checksum, len; 592 593 len = get_fmp_len(rec); 594 if (len < sizeof(struct cper_sec_fru_mem_poison)) { 595 pr_debug("fmp length is too small\n"); 596 return false; 597 } 598 599 /* Checksum must sum to zero for the entire section. */ 600 checksum = do_fmp_checksum(fmp, len) + fmp->checksum; 601 if (checksum) { 602 pr_debug("fmp checksum failed: sum = 0x%x\n", checksum); 603 print_hex_dump_debug("fmp record: ", DUMP_PREFIX_NONE, 16, 1, fmp, len, false); 604 return false; 605 } 606 607 if (!fmp_is_usable(rec)) 608 return false; 609 610 return true; 611 } 612 613 static struct fru_rec *get_valid_record(struct fru_rec *old) 614 { 615 struct fru_rec *new; 616 617 if (!fmp_is_valid(old)) { 618 pr_debug("Ignoring invalid record\n"); 619 return NULL; 620 } 621 622 new = get_fru_record(old->fmp.fru_id); 623 if (!new) 624 pr_debug("Ignoring record for absent FRU\n"); 625 626 return new; 627 } 628 629 /* 630 * Fetch saved records from persistent storage. 631 * 632 * For each found record: 633 * - If it was not created by this module, then ignore it. 634 * - If it is valid, then copy its data to the local cache. 635 * - If it is not valid, then erase it. 636 */ 637 static int get_saved_records(void) 638 { 639 struct fru_rec *old, *new; 640 u64 record_id; 641 int ret, pos; 642 ssize_t len; 643 644 /* 645 * Assume saved records match current max size. 
646 * 647 * However, this may not be true depending on module parameters. 648 */ 649 old = kmalloc(max_rec_len, GFP_KERNEL); 650 if (!old) { 651 ret = -ENOMEM; 652 goto out; 653 } 654 655 ret = erst_get_record_id_begin(&pos); 656 if (ret < 0) 657 goto out_end; 658 659 while (!erst_get_record_id_next(&pos, &record_id)) { 660 if (record_id == APEI_ERST_INVALID_RECORD_ID) 661 goto out_end; 662 /* 663 * Make sure to clear temporary buffer between reads to avoid 664 * leftover data from records of various sizes. 665 */ 666 memset(old, 0, max_rec_len); 667 668 len = erst_read_record(record_id, &old->hdr, max_rec_len, 669 sizeof(struct fru_rec), &CPER_CREATOR_FMP); 670 if (len < 0) 671 continue; 672 673 if (len > max_rec_len) { 674 pr_debug("Found record larger than max_rec_len\n"); 675 continue; 676 } 677 678 new = get_valid_record(old); 679 if (!new) 680 erst_clear(record_id); 681 682 /* Restore the record */ 683 memcpy(new, old, len); 684 } 685 686 out_end: 687 erst_get_record_id_end(); 688 kfree(old); 689 out: 690 return ret; 691 } 692 693 static void set_fmp_fields(struct fru_rec *rec, unsigned int cpu) 694 { 695 struct cper_sec_fru_mem_poison *fmp = &rec->fmp; 696 697 fmp->fru_arch_type = FMP_ARCH_TYPE_X86_CPUID_1_EAX; 698 fmp->validation_bits |= FMP_VALID_ARCH_TYPE; 699 700 /* Assume all CPUs in the system have the same value for now. 
	 */
	fmp->fru_arch = cpuid_eax(1);
	fmp->validation_bits |= FMP_VALID_ARCH;

	fmp->fru_id_type = FMP_ID_TYPE_X86_PPIN;
	fmp->validation_bits |= FMP_VALID_ID_TYPE;

	fmp->fru_id = topology_ppin(cpu);
	fmp->validation_bits |= FMP_VALID_ID;
}

/*
 * Fill in the identification fields of every FRU record at init time.
 * Each FRU is a CPU package; any online CPU in package 'i' can provide
 * the package's PPIN. Fails if a package has no online CPU.
 */
static int init_fmps(void)
{
	struct fru_rec *rec;
	unsigned int i, cpu;
	int ret = 0;

	for_each_fru(i, rec) {
		unsigned int fru_cpu = INVALID_CPU;

		/* Pick any online CPU whose physical package matches this FRU index. */
		cpus_read_lock();
		for_each_online_cpu(cpu) {
			if (topology_physical_package_id(cpu) == i) {
				fru_cpu = cpu;
				break;
			}
		}
		cpus_read_unlock();

		if (fru_cpu == INVALID_CPU) {
			pr_debug("Failed to find matching CPU for FRU #%u\n", i);
			ret = -ENODEV;
			break;
		}

		set_fmp_fields(rec, fru_cpu);
	}

	return ret;
}

/*
 * Probe system topology and derive the sizing globals (max_nr_fru,
 * max_nr_entries, spa_nr_entries, max_rec_len) used by all allocations.
 * Family is already gated by the x86 CPU ID table at module load.
 */
static int get_system_info(void)
{
	/* Only load on MI300A systems for now. */
	if (!(boot_cpu_data.x86_model >= 0x90 &&
	      boot_cpu_data.x86_model <= 0x9f))
		return -ENODEV;

	if (!cpu_feature_enabled(X86_FEATURE_AMD_PPIN)) {
		pr_debug("PPIN feature not available\n");
		return -ENODEV;
	}

	/* Use CPU socket as FRU for MI300 systems. */
	max_nr_fru = topology_max_packages();
	if (!max_nr_fru)
		return -ENODEV;

	if (max_nr_fru > FMPM_MAX_NR_FRU) {
		pr_warn("Too many FRUs to manage: found: %u, max: %u\n",
			max_nr_fru, FMPM_MAX_NR_FRU);
		return -ENODEV;
	}

	if (!max_nr_entries)
		max_nr_entries = FMPM_DEFAULT_MAX_NR_ENTRIES;

	spa_nr_entries = max_nr_fru * max_nr_entries;

	/* Record length includes pre-allocated space for all descriptor entries. */
	max_rec_len = sizeof(struct fru_rec);
	max_rec_len += sizeof(struct cper_fru_poison_desc) * max_nr_entries;

	pr_info("max FRUs: %u, max entries: %u, max record length: %lu\n",
		max_nr_fru, max_nr_entries, max_rec_len);

	return 0;
}

/* Free all per-FRU records, the record pointer array, and the SPA cache. */
static void free_records(void)
{
	struct fru_rec *rec;
	int i;

	for_each_fru(i, rec)
		kfree(rec);

	kfree(fru_records);
	kfree(spa_entries);
}

/*
 * Allocate the record cache (one zeroed, full-size record per FRU) and the
 * SPA translation table (initialized to INVALID_SPA). Unwinds fully on error.
 */
static int allocate_records(void)
{
	int i, ret = 0;

	fru_records = kcalloc(max_nr_fru, sizeof(struct fru_rec *), GFP_KERNEL);
	if (!fru_records) {
		ret = -ENOMEM;
		goto out;
	}

	for (i = 0; i < max_nr_fru; i++) {
		fru_records[i] = kzalloc(max_rec_len, GFP_KERNEL);
		if (!fru_records[i]) {
			ret = -ENOMEM;
			goto out_free;
		}
	}

	spa_entries = kcalloc(spa_nr_entries, sizeof(u64), GFP_KERNEL);
	if (!spa_entries) {
		ret = -ENOMEM;
		goto out_free;
	}

	for (i = 0; i < spa_nr_entries; i++)
		spa_entries[i] = INVALID_SPA;

	return ret;

out_free:
	/* Free only the records allocated before the failure. */
	while (--i >= 0)
		kfree(fru_records[i]);

	kfree(fru_records);
out:
	return ret;
}

/*
 * seq_file iterators for the debugfs "entries" file: position 0 is the
 * header line, positions 1..spa_nr_entries are the SPA entries.
 */
static void *fmpm_start(struct seq_file *f, loff_t *pos)
{
	if (*pos >= (spa_nr_entries + 1))
		return NULL;
	return pos;
}

static void *fmpm_next(struct seq_file *f, void *data, loff_t *pos)
{
	if (++(*pos) >= (spa_nr_entries + 1))
		return NULL;
	return pos;
}

static void fmpm_stop(struct seq_file *f, void *data)
{
}

/* Output column widths for the debugfs "entries" table. */
#define SHORT_WIDTH 8
847 #define U64_WIDTH 18 848 #define TIMESTAMP_WIDTH 19 849 #define LONG_WIDTH 24 850 #define U64_PAD (LONG_WIDTH - U64_WIDTH) 851 #define TS_PAD (LONG_WIDTH - TIMESTAMP_WIDTH) 852 static int fmpm_show(struct seq_file *f, void *data) 853 { 854 unsigned int fru_idx, entry, spa_entry, line; 855 struct cper_fru_poison_desc *fpd; 856 struct fru_rec *rec; 857 858 line = *(loff_t *)data; 859 if (line == 0) { 860 seq_printf(f, "%-*s", SHORT_WIDTH, "fru_idx"); 861 seq_printf(f, "%-*s", LONG_WIDTH, "fru_id"); 862 seq_printf(f, "%-*s", SHORT_WIDTH, "entry"); 863 seq_printf(f, "%-*s", LONG_WIDTH, "timestamp"); 864 seq_printf(f, "%-*s", LONG_WIDTH, "hw_id"); 865 seq_printf(f, "%-*s", LONG_WIDTH, "addr"); 866 seq_printf(f, "%-*s", LONG_WIDTH, "spa"); 867 goto out_newline; 868 } 869 870 spa_entry = line - 1; 871 fru_idx = spa_entry / max_nr_entries; 872 entry = spa_entry % max_nr_entries; 873 874 rec = fru_records[fru_idx]; 875 if (!rec) 876 goto out; 877 878 seq_printf(f, "%-*u", SHORT_WIDTH, fru_idx); 879 seq_printf(f, "0x%016llx%-*s", rec->fmp.fru_id, U64_PAD, ""); 880 seq_printf(f, "%-*u", SHORT_WIDTH, entry); 881 882 mutex_lock(&fmpm_update_mutex); 883 884 if (entry >= rec->fmp.nr_entries) { 885 seq_printf(f, "%-*s", LONG_WIDTH, "*"); 886 seq_printf(f, "%-*s", LONG_WIDTH, "*"); 887 seq_printf(f, "%-*s", LONG_WIDTH, "*"); 888 seq_printf(f, "%-*s", LONG_WIDTH, "*"); 889 goto out_unlock; 890 } 891 892 fpd = &rec->entries[entry]; 893 894 seq_printf(f, "%ptT%-*s", &fpd->timestamp, TS_PAD, ""); 895 seq_printf(f, "0x%016llx%-*s", fpd->hw_id, U64_PAD, ""); 896 seq_printf(f, "0x%016llx%-*s", fpd->addr, U64_PAD, ""); 897 898 if (spa_entries[spa_entry] == INVALID_SPA) 899 seq_printf(f, "%-*s", LONG_WIDTH, "*"); 900 else 901 seq_printf(f, "0x%016llx%-*s", spa_entries[spa_entry], U64_PAD, ""); 902 903 out_unlock: 904 mutex_unlock(&fmpm_update_mutex); 905 out_newline: 906 seq_putc(f, '\n'); 907 out: 908 return 0; 909 } 910 911 static const struct seq_operations fmpm_seq_ops = { 912 
.start = fmpm_start, 913 .next = fmpm_next, 914 .stop = fmpm_stop, 915 .show = fmpm_show, 916 }; 917 918 static int fmpm_open(struct inode *inode, struct file *file) 919 { 920 return seq_open(file, &fmpm_seq_ops); 921 } 922 923 static const struct file_operations fmpm_fops = { 924 .open = fmpm_open, 925 .release = seq_release, 926 .read = seq_read, 927 .llseek = seq_lseek, 928 }; 929 930 static void setup_debugfs(void) 931 { 932 struct dentry *dfs = ras_get_debugfs_root(); 933 934 if (!dfs) 935 return; 936 937 fmpm_dfs_dir = debugfs_create_dir("fmpm", dfs); 938 if (!fmpm_dfs_dir) 939 return; 940 941 fmpm_dfs_entries = debugfs_create_file("entries", 0400, fmpm_dfs_dir, NULL, &fmpm_fops); 942 if (!fmpm_dfs_entries) 943 debugfs_remove(fmpm_dfs_dir); 944 } 945 946 static const struct x86_cpu_id fmpm_cpuids[] = { 947 X86_MATCH_VENDOR_FAM(AMD, 0x19, NULL), 948 { } 949 }; 950 MODULE_DEVICE_TABLE(x86cpu, fmpm_cpuids); 951 952 static int __init fru_mem_poison_init(void) 953 { 954 int ret; 955 956 if (!x86_match_cpu(fmpm_cpuids)) { 957 ret = -ENODEV; 958 goto out; 959 } 960 961 if (erst_disable) { 962 pr_debug("ERST not available\n"); 963 ret = -ENODEV; 964 goto out; 965 } 966 967 ret = get_system_info(); 968 if (ret) 969 goto out; 970 971 ret = allocate_records(); 972 if (ret) 973 goto out; 974 975 ret = init_fmps(); 976 if (ret) 977 goto out_free; 978 979 ret = get_saved_records(); 980 if (ret) 981 goto out_free; 982 983 ret = save_new_records(); 984 if (ret) 985 goto out_free; 986 987 setup_debugfs(); 988 989 retire_mem_records(); 990 991 mce_register_decode_chain(&fru_mem_poison_nb); 992 993 pr_info("FRU Memory Poison Manager initialized\n"); 994 return 0; 995 996 out_free: 997 free_records(); 998 out: 999 return ret; 1000 } 1001 1002 static void __exit fru_mem_poison_exit(void) 1003 { 1004 mce_unregister_decode_chain(&fru_mem_poison_nb); 1005 debugfs_remove(fmpm_dfs_dir); 1006 free_records(); 1007 } 1008 1009 module_init(fru_mem_poison_init); 1010 
module_exit(fru_mem_poison_exit); 1011 1012 MODULE_LICENSE("GPL"); 1013 MODULE_DESCRIPTION("FRU Memory Poison Manager"); 1014