xref: /linux/drivers/ras/amd/fmpm.c (revision dd61b55d733eee9bbe51abe7ab0e6f2ce1fae332)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * FRU (Field-Replaceable Unit) Memory Poison Manager
4  *
5  * Copyright (c) 2024, Advanced Micro Devices, Inc.
6  * All Rights Reserved.
7  *
8  * Authors:
9  *	Naveen Krishna Chatradhi <naveenkrishna.chatradhi@amd.com>
10  *	Muralidhara M K <muralidhara.mk@amd.com>
11  *	Yazen Ghannam <Yazen.Ghannam@amd.com>
12  *
13  * Implementation notes, assumptions, and limitations:
14  *
15  * - FRU memory poison section and memory poison descriptor definitions are not yet
16  *   included in the UEFI specification. So they are defined here. Afterwards, they
17  *   may be moved to linux/cper.h, if appropriate.
18  *
19  * - Platforms based on AMD MI300 systems will be the first to use these structures.
20  *   There are a number of assumptions made here that will need to be generalized
21  *   to support other platforms.
22  *
23  *   AMD MI300-based platform(s) assumptions:
24  *   - Memory errors are reported through x86 MCA.
25  *   - The entire DRAM row containing a memory error should be retired.
26  *   - There will be (1) FRU memory poison section per CPER.
27  *   - The FRU will be the CPU package (processor socket).
28  *   - The default number of memory poison descriptor entries should be (8).
29  *   - The platform will use ACPI ERST for persistent storage.
30  *   - All FRU records should be saved to persistent storage. Module init will
31  *     fail if any FRU record is not successfully written.
32  *
33  * - Boot time memory retirement may occur later than ideal due to dependencies
34  *   on other libraries and drivers. This leaves a gap where bad memory may be
35  *   accessed during early boot stages.
36  *
37  * - Enough memory should be pre-allocated for each FRU record to be able to hold
38  *   the expected number of descriptor entries. This, mostly empty, record is
39  *   written to storage during init time. Subsequent writes to the same record
40  *   should allow the Platform to update the stored record in-place. Otherwise,
41  *   if the record is extended, then the Platform may need to perform costly memory
42  *   management operations on the storage. For example, the Platform may spend time
43  *   in Firmware copying and invalidating memory on a relatively slow SPI ROM.
44  */
45 
46 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
47 
#include <linux/bitmap.h>
#include <linux/cper.h>
#include <linux/cpu.h>
#include <linux/ras.h>

#include <acpi/apei.h>

#include <asm/cpu_device_id.h>
#include <asm/mce.h>
56 
57 #define INVALID_CPU			UINT_MAX
58 
59 /* Validation Bits */
60 #define FMP_VALID_ARCH_TYPE		BIT_ULL(0)
61 #define FMP_VALID_ARCH			BIT_ULL(1)
62 #define FMP_VALID_ID_TYPE		BIT_ULL(2)
63 #define FMP_VALID_ID			BIT_ULL(3)
64 #define FMP_VALID_LIST_ENTRIES		BIT_ULL(4)
65 #define FMP_VALID_LIST			BIT_ULL(5)
66 
67 /* FRU Architecture Types */
68 #define FMP_ARCH_TYPE_X86_CPUID_1_EAX	0
69 
70 /* FRU ID Types */
71 #define FMP_ID_TYPE_X86_PPIN		0
72 
/* FRU Memory Poison Section */
struct cper_sec_fru_mem_poison {
	u32 checksum;		/* Complement of the section byte sum; section sums to zero. */
	u64 validation_bits;	/* FMP_VALID_* flags for the fields below. */
	u32 fru_arch_type;	/* FMP_ARCH_TYPE_*: encoding of fru_arch. */
	u64 fru_arch;		/* Architecture value, e.g. CPUID Fn0000_0001_EAX. */
	u32 fru_id_type;	/* FMP_ID_TYPE_*: encoding of fru_id. */
	u64 fru_id;		/* FRU identifier, e.g. x86 PPIN. */
	u32 nr_entries;		/* Number of poison descriptor entries that follow. */
} __packed;
83 
84 /* FRU Descriptor ID Types */
85 #define FPD_HW_ID_TYPE_MCA_IPID		0
86 
87 /* FRU Descriptor Address Types */
88 #define FPD_ADDR_TYPE_MCA_ADDR		0
89 
/* Memory Poison Descriptor */
struct cper_fru_poison_desc {
	u64 timestamp;		/* Time of the error (taken from struct mce). */
	u32 hw_id_type;		/* FPD_HW_ID_TYPE_*: encoding of hw_id. */
	u64 hw_id;		/* Hardware ID, e.g. MCA_IPID. */
	u32 addr_type;		/* FPD_ADDR_TYPE_*: encoding of addr. */
	u64 addr;		/* Poisoned address, e.g. MCA_ADDR. */
} __packed;
98 
/* Collection of headers and sections for easy pointer use. */
struct fru_rec {
	struct cper_record_header	hdr;		/* CPER record header. */
	struct cper_section_descriptor	sec_desc;	/* One descriptor; one section per record assumed. */
	struct cper_sec_fru_mem_poison	fmp;		/* FRU memory poison section. */
	struct cper_fru_poison_desc	entries[];	/* Up to max_nr_entries descriptors. */
} __packed;
106 
107 /*
108  * Pointers to the complete CPER record of each FRU.
109  *
110  * Memory allocation will include padded space for descriptor entries.
111  */
112 static struct fru_rec **fru_records;
113 
114 #define CPER_CREATOR_FMP						\
115 	GUID_INIT(0xcd5c2993, 0xf4b2, 0x41b2, 0xb5, 0xd4, 0xf9, 0xc3,	\
116 		  0xa0, 0x33, 0x08, 0x75)
117 
118 #define CPER_SECTION_TYPE_FMP						\
119 	GUID_INIT(0x5e4706c1, 0x5356, 0x48c6, 0x93, 0x0b, 0x52, 0xf2,	\
120 		  0x12, 0x0a, 0x44, 0x58)
121 
122 /**
123  * DOC: fru_poison_entries (byte)
124  * Maximum number of descriptor entries possible for each FRU.
125  *
126  * Values between '1' and '255' are valid.
127  * No input or '0' will default to FMPM_DEFAULT_MAX_NR_ENTRIES.
128  */
129 static u8 max_nr_entries;
130 module_param(max_nr_entries, byte, 0644);
131 MODULE_PARM_DESC(max_nr_entries,
132 		 "Maximum number of memory poison descriptor entries per FRU");
133 
134 #define FMPM_DEFAULT_MAX_NR_ENTRIES	8
135 
136 /* Maximum number of FRUs in the system. */
137 #define FMPM_MAX_NR_FRU			256
138 static unsigned int max_nr_fru;
139 
140 /* Total length of record including headers and list of descriptor entries. */
141 static size_t max_rec_len;
142 
143 /*
144  * Protect the local records cache in fru_records and prevent concurrent
145  * writes to storage. This is only needed after init once notifier block
146  * registration is done.
147  */
148 static DEFINE_MUTEX(fmpm_update_mutex);
149 
/*
 * Iterate over all cached FRU records.
 *
 * Check the bound before reading fru_records[i]: the original
 * comma-expression form evaluated "rec = fru_records[i]" first, reading
 * one element past the end of the array on the terminating iteration.
 */
#define for_each_fru(i, rec) \
	for (i = 0; i < max_nr_fru && (rec = fru_records[i], 1); i++)
152 
153 static inline u32 get_fmp_len(struct fru_rec *rec)
154 {
155 	return rec->sec_desc.section_length - sizeof(struct cper_section_descriptor);
156 }
157 
158 static struct fru_rec *get_fru_record(u64 fru_id)
159 {
160 	struct fru_rec *rec;
161 	unsigned int i;
162 
163 	for_each_fru(i, rec) {
164 		if (rec->fmp.fru_id == fru_id)
165 			return rec;
166 	}
167 
168 	pr_debug("Record not found for FRU 0x%016llx\n", fru_id);
169 
170 	return NULL;
171 }
172 
173 /*
174  * Sum up all bytes within the FRU Memory Poison Section including the Memory
175  * Poison Descriptor entries.
176  *
177  * Don't include the old checksum here. It's a u32 value, so summing each of its
178  * bytes will give the wrong total.
179  */
180 static u32 do_fmp_checksum(struct cper_sec_fru_mem_poison *fmp, u32 len)
181 {
182 	u32 checksum = 0;
183 	u8 *buf, *end;
184 
185 	/* Skip old checksum. */
186 	buf = (u8 *)fmp + sizeof(u32);
187 	end = buf + len;
188 
189 	while (buf < end)
190 		checksum += (u8)(*(buf++));
191 
192 	return checksum;
193 }
194 
195 static int update_record_on_storage(struct fru_rec *rec)
196 {
197 	u32 len, checksum;
198 	int ret;
199 
200 	/* Calculate a new checksum. */
201 	len = get_fmp_len(rec);
202 
203 	/* Get the current total. */
204 	checksum = do_fmp_checksum(&rec->fmp, len);
205 
206 	/* Use the complement value. */
207 	rec->fmp.checksum = -checksum;
208 
209 	pr_debug("Writing to storage\n");
210 
211 	ret = erst_write(&rec->hdr);
212 	if (ret) {
213 		pr_warn("Storage update failed for FRU 0x%016llx\n", rec->fmp.fru_id);
214 
215 		if (ret == -ENOSPC)
216 			pr_warn("Not enough space on storage\n");
217 	}
218 
219 	return ret;
220 }
221 
222 static bool rec_has_valid_entries(struct fru_rec *rec)
223 {
224 	if (!(rec->fmp.validation_bits & FMP_VALID_LIST_ENTRIES))
225 		return false;
226 
227 	if (!(rec->fmp.validation_bits & FMP_VALID_LIST))
228 		return false;
229 
230 	return true;
231 }
232 
233 static bool fpds_equal(struct cper_fru_poison_desc *old, struct cper_fru_poison_desc *new)
234 {
235 	/*
236 	 * Ignore timestamp field.
237 	 * The same physical error may be reported multiple times due to stuck bits, etc.
238 	 *
239 	 * Also, order the checks from most->least likely to fail to shortcut the code.
240 	 */
241 	if (old->addr != new->addr)
242 		return false;
243 
244 	if (old->hw_id != new->hw_id)
245 		return false;
246 
247 	if (old->addr_type != new->addr_type)
248 		return false;
249 
250 	if (old->hw_id_type != new->hw_id_type)
251 		return false;
252 
253 	return true;
254 }
255 
256 static bool rec_has_fpd(struct fru_rec *rec, struct cper_fru_poison_desc *fpd)
257 {
258 	unsigned int i;
259 
260 	for (i = 0; i < rec->fmp.nr_entries; i++) {
261 		struct cper_fru_poison_desc *fpd_i = &rec->entries[i];
262 
263 		if (fpds_equal(fpd_i, fpd)) {
264 			pr_debug("Found duplicate record\n");
265 			return true;
266 		}
267 	}
268 
269 	return false;
270 }
271 
/*
 * Build a Memory Poison Descriptor from @m, append it to @rec (unless it
 * is a duplicate or the list is full), and persist the updated record.
 *
 * The mutex guards both the cached record and the write to storage.
 */
static void update_fru_record(struct fru_rec *rec, struct mce *m)
{
	struct cper_sec_fru_mem_poison *fmp = &rec->fmp;
	struct cper_fru_poison_desc fpd, *fpd_dest;
	u32 entry = 0;

	mutex_lock(&fmpm_update_mutex);

	memset(&fpd, 0, sizeof(struct cper_fru_poison_desc));

	fpd.timestamp	= m->time;
	fpd.hw_id_type = FPD_HW_ID_TYPE_MCA_IPID;
	fpd.hw_id	= m->ipid;
	fpd.addr_type	= FPD_ADDR_TYPE_MCA_ADDR;
	fpd.addr	= m->addr;

	/* This is the first entry, so just save it. */
	if (!rec_has_valid_entries(rec))
		goto save_fpd;

	/* Ignore already recorded errors. */
	if (rec_has_fpd(rec, &fpd))
		goto out_unlock;

	if (rec->fmp.nr_entries >= max_nr_entries) {
		pr_warn("Exceeded number of entries for FRU 0x%016llx\n", rec->fmp.fru_id);
		goto out_unlock;
	}

	/* Append after the current last entry. */
	entry  = fmp->nr_entries;

save_fpd:
	fpd_dest  = &rec->entries[entry];
	memcpy(fpd_dest, &fpd, sizeof(struct cper_fru_poison_desc));

	fmp->nr_entries		 = entry + 1;
	fmp->validation_bits	|= FMP_VALID_LIST_ENTRIES;
	fmp->validation_bits	|= FMP_VALID_LIST;

	pr_debug("Updated FRU 0x%016llx entry #%u\n", fmp->fru_id, entry);

	/* Best effort: a failed write is logged by the callee. */
	update_record_on_storage(rec);

out_unlock:
	mutex_unlock(&fmpm_update_mutex);
}
318 
319 static void retire_dram_row(u64 addr, u64 id, u32 cpu)
320 {
321 	struct atl_err a_err;
322 
323 	memset(&a_err, 0, sizeof(struct atl_err));
324 
325 	a_err.addr = addr;
326 	a_err.ipid = id;
327 	a_err.cpu  = cpu;
328 
329 	amd_retire_dram_row(&a_err);
330 }
331 
/*
 * MCE decode-chain callback: retire the affected DRAM row right away,
 * then record the error in the FRU record keyed by the reporting CPU's PPIN.
 */
static int fru_handle_mem_poison(struct notifier_block *nb, unsigned long val, void *data)
{
	struct mce *m = (struct mce *)data;
	struct fru_rec *rec;

	if (!mce_is_memory_error(m))
		return NOTIFY_DONE;

	/* Retire first; recording the error below is best effort. */
	retire_dram_row(m->addr, m->ipid, m->extcpu);

	/*
	 * An invalid FRU ID should not happen on real errors. But it
	 * could happen from software error injection, etc.
	 */
	rec = get_fru_record(m->ppin);
	if (!rec)
		return NOTIFY_DONE;

	update_fru_record(rec, m);

	return NOTIFY_OK;
}
354 
/* Registered at the lowest priority in the MCE decode chain. */
static struct notifier_block fru_mem_poison_nb = {
	.notifier_call  = fru_handle_mem_poison,
	.priority	= MCE_PRIO_LOWEST,
};
359 
360 static void retire_mem_fmp(struct fru_rec *rec)
361 {
362 	struct cper_sec_fru_mem_poison *fmp = &rec->fmp;
363 	unsigned int i, cpu;
364 
365 	for (i = 0; i < fmp->nr_entries; i++) {
366 		struct cper_fru_poison_desc *fpd = &rec->entries[i];
367 		unsigned int err_cpu = INVALID_CPU;
368 
369 		if (fpd->hw_id_type != FPD_HW_ID_TYPE_MCA_IPID)
370 			continue;
371 
372 		if (fpd->addr_type != FPD_ADDR_TYPE_MCA_ADDR)
373 			continue;
374 
375 		cpus_read_lock();
376 		for_each_online_cpu(cpu) {
377 			if (topology_ppin(cpu) == fmp->fru_id) {
378 				err_cpu = cpu;
379 				break;
380 			}
381 		}
382 		cpus_read_unlock();
383 
384 		if (err_cpu == INVALID_CPU)
385 			continue;
386 
387 		retire_dram_row(fpd->addr, fpd->hw_id, err_cpu);
388 	}
389 }
390 
391 static void retire_mem_records(void)
392 {
393 	struct fru_rec *rec;
394 	unsigned int i;
395 
396 	for_each_fru(i, rec) {
397 		if (!rec_has_valid_entries(rec))
398 			continue;
399 
400 		retire_mem_fmp(rec);
401 	}
402 }
403 
/* Set the CPER Record Header and CPER Section Descriptor fields. */
static void set_rec_fields(struct fru_rec *rec)
{
	struct cper_section_descriptor	*sec_desc = &rec->sec_desc;
	struct cper_record_header	*hdr	  = &rec->hdr;

	memcpy(hdr->signature, CPER_SIG_RECORD, CPER_SIG_SIZE);
	hdr->revision			= CPER_RECORD_REV;
	hdr->signature_end		= CPER_SIG_END;

	/*
	 * Currently, it is assumed that there is one FRU Memory Poison
	 * section per CPER. But this may change for other implementations.
	 */
	hdr->section_count		= 1;

	/* The logged errors are recoverable. Otherwise, they'd never make it here. */
	hdr->error_severity		= CPER_SEV_RECOVERABLE;

	hdr->validation_bits		= 0;
	/* Record is pre-sized for max_nr_entries descriptors. */
	hdr->record_length		= max_rec_len;
	hdr->creator_id			= CPER_CREATOR_FMP;
	hdr->notification_type		= CPER_NOTIFY_MCE;
	hdr->record_id			= cper_next_record_id();
	hdr->flags			= CPER_HW_ERROR_FLAGS_PREVERR;

	/* The single section immediately follows the record header. */
	sec_desc->section_offset	= sizeof(struct cper_record_header);
	sec_desc->section_length	= max_rec_len - sizeof(struct cper_record_header);
	sec_desc->revision		= CPER_SEC_REV;
	sec_desc->validation_bits	= 0;
	sec_desc->flags			= CPER_SEC_PRIMARY;
	sec_desc->section_type		= CPER_SECTION_TYPE_FMP;
	sec_desc->section_severity	= CPER_SEV_RECOVERABLE;
}
438 
439 static int save_new_records(void)
440 {
441 	DECLARE_BITMAP(new_records, FMPM_MAX_NR_FRU);
442 	struct fru_rec *rec;
443 	unsigned int i;
444 	int ret = 0;
445 
446 	for_each_fru(i, rec) {
447 		if (rec->hdr.record_length)
448 			continue;
449 
450 		set_rec_fields(rec);
451 
452 		ret = update_record_on_storage(rec);
453 		if (ret)
454 			goto out_clear;
455 
456 		set_bit(i, new_records);
457 	}
458 
459 	return ret;
460 
461 out_clear:
462 	for_each_fru(i, rec) {
463 		if (!test_bit(i, new_records))
464 			continue;
465 
466 		erst_clear(rec->hdr.record_id);
467 	}
468 
469 	return ret;
470 }
471 
/* Check that the record matches expected types for the current system. */
static bool fmp_is_usable(struct fru_rec *rec)
{
	struct cper_sec_fru_mem_poison *fmp = &rec->fmp;
	u64 cpuid;

	pr_debug("Validation bits: 0x%016llx\n", fmp->validation_bits);

	/* Each field is only trusted if its validation bit is set. */
	if (!(fmp->validation_bits & FMP_VALID_ARCH_TYPE)) {
		pr_debug("Arch type unknown\n");
		return false;
	}

	if (fmp->fru_arch_type != FMP_ARCH_TYPE_X86_CPUID_1_EAX) {
		pr_debug("Arch type not 'x86 Family/Model/Stepping'\n");
		return false;
	}

	if (!(fmp->validation_bits & FMP_VALID_ARCH)) {
		pr_debug("Arch value unknown\n");
		return false;
	}

	/* The stored arch value must match the running system exactly. */
	cpuid = cpuid_eax(1);
	if (fmp->fru_arch != cpuid) {
		pr_debug("Arch value mismatch: record = 0x%016llx, system = 0x%016llx\n",
			 fmp->fru_arch, cpuid);
		return false;
	}

	if (!(fmp->validation_bits & FMP_VALID_ID_TYPE)) {
		pr_debug("FRU ID type unknown\n");
		return false;
	}

	if (fmp->fru_id_type != FMP_ID_TYPE_X86_PPIN) {
		pr_debug("FRU ID type is not 'x86 PPIN'\n");
		return false;
	}

	if (!(fmp->validation_bits & FMP_VALID_ID)) {
		pr_debug("FRU ID value unknown\n");
		return false;
	}

	return true;
}
519 
/*
 * Structural validation of a record read from storage: minimum section
 * length, a zero checksum over the section, and the type checks in
 * fmp_is_usable().
 */
static bool fmp_is_valid(struct fru_rec *rec)
{
	struct cper_sec_fru_mem_poison *fmp = &rec->fmp;
	u32 checksum, len;

	len = get_fmp_len(rec);
	if (len < sizeof(struct cper_sec_fru_mem_poison)) {
		pr_debug("fmp length is too small\n");
		return false;
	}

	/* Checksum must sum to zero for the entire section. */
	checksum = do_fmp_checksum(fmp, len) + fmp->checksum;
	if (checksum) {
		pr_debug("fmp checksum failed: sum = 0x%x\n", checksum);
		print_hex_dump_debug("fmp record: ", DUMP_PREFIX_NONE, 16, 1, fmp, len, false);
		return false;
	}

	if (!fmp_is_usable(rec))
		return false;

	return true;
}
544 
545 static struct fru_rec *get_valid_record(struct fru_rec *old)
546 {
547 	struct fru_rec *new;
548 
549 	if (!fmp_is_valid(old)) {
550 		pr_debug("Ignoring invalid record\n");
551 		return NULL;
552 	}
553 
554 	new = get_fru_record(old->fmp.fru_id);
555 	if (!new)
556 		pr_debug("Ignoring record for absent FRU\n");
557 
558 	return new;
559 }
560 
561 /*
562  * Fetch saved records from persistent storage.
563  *
564  * For each found record:
565  * - If it was not created by this module, then ignore it.
566  * - If it is valid, then copy its data to the local cache.
567  * - If it is not valid, then erase it.
568  */
569 static int get_saved_records(void)
570 {
571 	struct fru_rec *old, *new;
572 	u64 record_id;
573 	int ret, pos;
574 	ssize_t len;
575 
576 	/*
577 	 * Assume saved records match current max size.
578 	 *
579 	 * However, this may not be true depending on module parameters.
580 	 */
581 	old = kmalloc(max_rec_len, GFP_KERNEL);
582 	if (!old) {
583 		ret = -ENOMEM;
584 		goto out;
585 	}
586 
587 	ret = erst_get_record_id_begin(&pos);
588 	if (ret < 0)
589 		goto out_end;
590 
591 	while (!erst_get_record_id_next(&pos, &record_id)) {
592 		if (record_id == APEI_ERST_INVALID_RECORD_ID)
593 			goto out_end;
594 		/*
595 		 * Make sure to clear temporary buffer between reads to avoid
596 		 * leftover data from records of various sizes.
597 		 */
598 		memset(old, 0, max_rec_len);
599 
600 		len = erst_read_record(record_id, &old->hdr, max_rec_len,
601 				       sizeof(struct fru_rec), &CPER_CREATOR_FMP);
602 		if (len < 0)
603 			continue;
604 
605 		if (len > max_rec_len) {
606 			pr_debug("Found record larger than max_rec_len\n");
607 			continue;
608 		}
609 
610 		new = get_valid_record(old);
611 		if (!new)
612 			erst_clear(record_id);
613 
614 		/* Restore the record */
615 		memcpy(new, old, len);
616 	}
617 
618 out_end:
619 	erst_get_record_id_end();
620 	kfree(old);
621 out:
622 	return ret;
623 }
624 
/* Fill the FMP section identity fields for the FRU represented by @cpu. */
static void set_fmp_fields(struct fru_rec *rec, unsigned int cpu)
{
	struct cper_sec_fru_mem_poison *fmp = &rec->fmp;

	fmp->fru_arch_type    = FMP_ARCH_TYPE_X86_CPUID_1_EAX;
	fmp->validation_bits |= FMP_VALID_ARCH_TYPE;

	/* Assume all CPUs in the system have the same value for now. */
	fmp->fru_arch	      = cpuid_eax(1);
	fmp->validation_bits |= FMP_VALID_ARCH;

	fmp->fru_id_type      = FMP_ID_TYPE_X86_PPIN;
	fmp->validation_bits |= FMP_VALID_ID_TYPE;

	/* The FRU is identified by the package's PPIN. */
	fmp->fru_id	      = topology_ppin(cpu);
	fmp->validation_bits |= FMP_VALID_ID;
}
642 
643 static int init_fmps(void)
644 {
645 	struct fru_rec *rec;
646 	unsigned int i, cpu;
647 	int ret = 0;
648 
649 	for_each_fru(i, rec) {
650 		unsigned int fru_cpu = INVALID_CPU;
651 
652 		cpus_read_lock();
653 		for_each_online_cpu(cpu) {
654 			if (topology_physical_package_id(cpu) == i) {
655 				fru_cpu = cpu;
656 				break;
657 			}
658 		}
659 		cpus_read_unlock();
660 
661 		if (fru_cpu == INVALID_CPU) {
662 			pr_debug("Failed to find matching CPU for FRU #%u\n", i);
663 			ret = -ENODEV;
664 			break;
665 		}
666 
667 		set_fmp_fields(rec, fru_cpu);
668 	}
669 
670 	return ret;
671 }
672 
673 static int get_system_info(void)
674 {
675 	/* Only load on MI300A systems for now. */
676 	if (!(boot_cpu_data.x86_model >= 0x90 &&
677 	      boot_cpu_data.x86_model <= 0x9f))
678 		return -ENODEV;
679 
680 	if (!cpu_feature_enabled(X86_FEATURE_AMD_PPIN)) {
681 		pr_debug("PPIN feature not available\n");
682 		return -ENODEV;
683 	}
684 
685 	/* Use CPU socket as FRU for MI300 systems. */
686 	max_nr_fru = topology_max_packages();
687 	if (!max_nr_fru)
688 		return -ENODEV;
689 
690 	if (max_nr_fru > FMPM_MAX_NR_FRU) {
691 		pr_warn("Too many FRUs to manage: found: %u, max: %u\n",
692 			max_nr_fru, FMPM_MAX_NR_FRU);
693 		return -ENODEV;
694 	}
695 
696 	if (!max_nr_entries)
697 		max_nr_entries = FMPM_DEFAULT_MAX_NR_ENTRIES;
698 
699 	max_rec_len  = sizeof(struct fru_rec);
700 	max_rec_len += sizeof(struct cper_fru_poison_desc) * max_nr_entries;
701 
702 	pr_info("max FRUs: %u, max entries: %u, max record length: %lu\n",
703 		 max_nr_fru, max_nr_entries, max_rec_len);
704 
705 	return 0;
706 }
707 
708 static void free_records(void)
709 {
710 	struct fru_rec *rec;
711 	int i;
712 
713 	for_each_fru(i, rec)
714 		kfree(rec);
715 
716 	kfree(fru_records);
717 }
718 
719 static int allocate_records(void)
720 {
721 	int i, ret = 0;
722 
723 	fru_records = kcalloc(max_nr_fru, sizeof(struct fru_rec *), GFP_KERNEL);
724 	if (!fru_records) {
725 		ret = -ENOMEM;
726 		goto out;
727 	}
728 
729 	for (i = 0; i < max_nr_fru; i++) {
730 		fru_records[i] = kzalloc(max_rec_len, GFP_KERNEL);
731 		if (!fru_records[i]) {
732 			ret = -ENOMEM;
733 			goto out_free;
734 		}
735 	}
736 
737 	return ret;
738 
739 out_free:
740 	for (; i >= 0; i--)
741 		kfree(fru_records[i]);
742 
743 	kfree(fru_records);
744 out:
745 	return ret;
746 }
747 
/* AMD family 0x19; get_system_info() further restricts to MI300 models. */
static const struct x86_cpu_id fmpm_cpuids[] = {
	X86_MATCH_VENDOR_FAM(AMD, 0x19, NULL),
	{ }
};
MODULE_DEVICE_TABLE(x86cpu, fmpm_cpuids);
753 
/*
 * Load only on supported hardware with ERST available. Build the local
 * record cache, reconcile it with persistent storage, replay retirement
 * for previously recorded poison, then start consuming new MCEs.
 */
static int __init fru_mem_poison_init(void)
{
	int ret;

	if (!x86_match_cpu(fmpm_cpuids)) {
		ret = -ENODEV;
		goto out;
	}

	if (erst_disable) {
		pr_debug("ERST not available\n");
		ret = -ENODEV;
		goto out;
	}

	ret = get_system_info();
	if (ret)
		goto out;

	ret = allocate_records();
	if (ret)
		goto out;

	ret = init_fmps();
	if (ret)
		goto out_free;

	/* Merge valid records already present in persistent storage. */
	ret = get_saved_records();
	if (ret)
		goto out_free;

	/* Write (mostly empty) records for FRUs that had none saved. */
	ret = save_new_records();
	if (ret)
		goto out_free;

	/* Retire rows for errors recorded on previous boots. */
	retire_mem_records();

	mce_register_decode_chain(&fru_mem_poison_nb);

	pr_info("FRU Memory Poison Manager initialized\n");
	return 0;

out_free:
	free_records();
out:
	return ret;
}
801 
/* Stop receiving MCEs before tearing down the record cache. */
static void __exit fru_mem_poison_exit(void)
{
	mce_unregister_decode_chain(&fru_mem_poison_nb);
	free_records();
}
807 
808 module_init(fru_mem_poison_init);
809 module_exit(fru_mem_poison_exit);
810 
811 MODULE_LICENSE("GPL");
812 MODULE_DESCRIPTION("FRU Memory Poison Manager");
813