// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright 2025 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 */
#include <linux/list.h>
#include "amdgpu.h"

static const guid_t MCE			= CPER_NOTIFY_MCE;
static const guid_t CMC			= CPER_NOTIFY_CMC;
static const guid_t BOOT		= BOOT_TYPE;

static const guid_t CRASHDUMP		= AMD_CRASHDUMP;
static const guid_t RUNTIME		= AMD_GPU_NONSTANDARD_ERROR;

static void __inc_entry_length(struct cper_hdr *hdr, uint32_t size)
{
	hdr->record_length += size;
}

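/*
 * Convert the current wall-clock time (UTC) into the split
 * seconds/minutes/hours/day/month/year/century layout used by the CPER
 * record header timestamp.
 */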
static void amdgpu_cper_get_timestamp(struct cper_timestamp *timestamp)
{
	struct tm tm;
	time64_t now = ktime_get_real_seconds();

	time64_to_tm(now, 0, &tm);
	timestamp->seconds = tm.tm_sec;
	timestamp->minutes = tm.tm_min;
	timestamp->hours = tm.tm_hour;
	timestamp->flag = 0;
	timestamp->day = tm.tm_mday;
	timestamp->month = 1 + tm.tm_mon;
	timestamp->year = (1900 + tm.tm_year) % 100;
	timestamp->century = (1900 + tm.tm_year) / 100;
}

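/*
 * Fill the common CPER record header: "CPER" signature, severity and
 * timestamp, a record ID of the form "<socket>:<unique id in hex>"
 * (e.g. "0:1A"), the PCI vendor/device pair as the platform ID, and a
 * notification type GUID chosen by record type (BOOT for boot errors,
 * MCE for fatal and bad-page-threshold records, CMC for corrected
 * runtime errors).
 */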
void amdgpu_cper_entry_fill_hdr(struct amdgpu_device *adev,
				struct cper_hdr *hdr,
				enum amdgpu_cper_type type,
				enum cper_error_severity sev)
{
	char record_id[16];

	hdr->signature[0]		= 'C';
	hdr->signature[1]		= 'P';
	hdr->signature[2]		= 'E';
	hdr->signature[3]		= 'R';
	hdr->revision			= CPER_HDR_REV_1;
	hdr->signature_end		= 0xFFFFFFFF;
	hdr->error_severity		= sev;

	hdr->valid_bits.platform_id	= 1;
	hdr->valid_bits.partition_id	= 1;
	hdr->valid_bits.timestamp	= 1;

	amdgpu_cper_get_timestamp(&hdr->timestamp);

	snprintf(record_id, 9, "%d:%X",
		 (adev->smuio.funcs && adev->smuio.funcs->get_socket_id) ?
			 adev->smuio.funcs->get_socket_id(adev) :
			 0,
		 atomic_inc_return(&adev->cper.unique_id));
	memcpy(hdr->record_id, record_id, 8);

	snprintf(hdr->platform_id, 16, "0x%04X:0x%04X",
		 adev->pdev->vendor, adev->pdev->device);
	/* pmfw version should be part of creator_id according to CPER spec */
	snprintf(hdr->creator_id, 16, "%s", CPER_CREATOR_ID_AMDGPU);

	switch (type) {
	case AMDGPU_CPER_TYPE_BOOT:
		hdr->notify_type = BOOT;
		break;
	case AMDGPU_CPER_TYPE_FATAL:
	case AMDGPU_CPER_TYPE_BP_THRESHOLD:
		hdr->notify_type = MCE;
		break;
	case AMDGPU_CPER_TYPE_RUNTIME:
		if (sev == CPER_SEV_NON_FATAL_CORRECTED)
			hdr->notify_type = CMC;
		else
			hdr->notify_type = MCE;
		break;
	default:
		dev_err(adev->dev, "Unknown CPER Type\n");
		break;
	}

	__inc_entry_length(hdr, HDR_LEN);
}

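/*
 * Fill one section descriptor: section revision, offset and length of
 * the section body within the record, per-section severity, the
 * section type GUID, and an FRU text naming the OAM socket the error
 * was reported on.  Bad-page-threshold sections set the "error
 * threshold exceeded" flag; poisoned errors are flagged as latent.
 */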
static int amdgpu_cper_entry_fill_section_desc(struct amdgpu_device *adev,
					       struct cper_sec_desc *section_desc,
					       bool bp_threshold,
					       bool poison,
					       enum cper_error_severity sev,
					       guid_t sec_type,
					       uint32_t section_length,
					       uint32_t section_offset)
{
	section_desc->revision_minor		= CPER_SEC_MINOR_REV_1;
	section_desc->revision_major		= CPER_SEC_MAJOR_REV_22;
	section_desc->sec_offset		= section_offset;
	section_desc->sec_length		= section_length;
	section_desc->valid_bits.fru_text	= 1;
	section_desc->flag_bits.primary		= 1;
	section_desc->severity			= sev;
	section_desc->sec_type			= sec_type;

	snprintf(section_desc->fru_text, 20, "OAM%d",
		 (adev->smuio.funcs && adev->smuio.funcs->get_socket_id) ?
			 adev->smuio.funcs->get_socket_id(adev) :
			 0);

	if (bp_threshold)
		section_desc->flag_bits.exceed_err_threshold = 1;
	if (poison)
		section_desc->flag_bits.latent_err = 1;

	return 0;
}

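/*
 * Append a crashdump section for a fatal (uncorrectable) error; the
 * caller's register snapshot is embedded verbatim as the crash context.
 */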
int amdgpu_cper_entry_fill_fatal_section(struct amdgpu_device *adev,
					 struct cper_hdr *hdr,
					 uint32_t idx,
					 struct cper_sec_crashdump_reg_data reg_data)
{
	struct cper_sec_desc *section_desc;
	struct cper_sec_crashdump_fatal *section;

	section_desc = (struct cper_sec_desc *)((uint8_t *)hdr + SEC_DESC_OFFSET(idx));
	section = (struct cper_sec_crashdump_fatal *)((uint8_t *)hdr +
		   FATAL_SEC_OFFSET(hdr->sec_cnt, idx));

	amdgpu_cper_entry_fill_section_desc(adev, section_desc, false, false,
					    CPER_SEV_FATAL, CRASHDUMP, FATAL_SEC_LEN,
					    FATAL_SEC_OFFSET(hdr->sec_cnt, idx));

	section->body.reg_ctx_type = CPER_CTX_TYPE_CRASH;
	section->body.reg_arr_size = sizeof(reg_data);
	section->body.data = reg_data;

	__inc_entry_length(hdr, SEC_DESC_LEN + FATAL_SEC_LEN);

	return 0;
}

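/*
 * Append a non-standard (AMD GPU) error section for a runtime error.
 * Any severity other than corrected is treated as poison and marked
 * latent in the descriptor; at most CPER_ACA_REG_COUNT dump registers
 * are copied into the section context.
 */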
int amdgpu_cper_entry_fill_runtime_section(struct amdgpu_device *adev,
					   struct cper_hdr *hdr,
					   uint32_t idx,
					   enum cper_error_severity sev,
					   uint32_t *reg_dump,
					   uint32_t reg_count)
{
	struct cper_sec_desc *section_desc;
	struct cper_sec_nonstd_err *section;
	bool poison;

	poison = (sev != CPER_SEV_NON_FATAL_CORRECTED);
	section_desc = (struct cper_sec_desc *)((uint8_t *)hdr + SEC_DESC_OFFSET(idx));
	section = (struct cper_sec_nonstd_err *)((uint8_t *)hdr +
		   NONSTD_SEC_OFFSET(hdr->sec_cnt, idx));

	amdgpu_cper_entry_fill_section_desc(adev, section_desc, false, poison,
					    sev, RUNTIME, NONSTD_SEC_LEN,
					    NONSTD_SEC_OFFSET(hdr->sec_cnt, idx));

	reg_count = umin(reg_count, CPER_ACA_REG_COUNT);

	section->hdr.valid_bits.err_info_cnt = 1;
	section->hdr.valid_bits.err_context_cnt = 1;

	section->info.error_type = RUNTIME;
	section->info.ms_chk_bits.err_type_valid = 1;
	section->ctx.reg_ctx_type = CPER_CTX_TYPE_CRASH;
	section->ctx.reg_arr_size = sizeof(section->ctx.reg_dump);

	memcpy(section->ctx.reg_dump, reg_dump, reg_count * sizeof(uint32_t));

	__inc_entry_length(hdr, SEC_DESC_LEN + NONSTD_SEC_LEN);

	return 0;
}

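/*
 * Append a section reporting that the bad page retirement threshold
 * has been exceeded.  No live register dump exists for this event, so
 * a fixed, synthesized ACA register set is written instead (see the
 * hardcoded values below).
 */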
int amdgpu_cper_entry_fill_bad_page_threshold_section(struct amdgpu_device *adev,
						      struct cper_hdr *hdr,
						      uint32_t idx)
{
	struct cper_sec_desc *section_desc;
	struct cper_sec_nonstd_err *section;

	section_desc = (struct cper_sec_desc *)((uint8_t *)hdr + SEC_DESC_OFFSET(idx));
	section = (struct cper_sec_nonstd_err *)((uint8_t *)hdr +
		   NONSTD_SEC_OFFSET(hdr->sec_cnt, idx));

	amdgpu_cper_entry_fill_section_desc(adev, section_desc, true, false,
					    CPER_SEV_NUM, RUNTIME, NONSTD_SEC_LEN,
					    NONSTD_SEC_OFFSET(hdr->sec_cnt, idx));

	section->hdr.valid_bits.err_info_cnt = 1;
	section->hdr.valid_bits.err_context_cnt = 1;

	section->info.error_type = RUNTIME;
	section->info.ms_chk_bits.err_type_valid = 1;
	section->ctx.reg_ctx_type = CPER_CTX_TYPE_CRASH;
	section->ctx.reg_arr_size = sizeof(section->ctx.reg_dump);

	/* Hardcoded Reg dump for bad page threshold CPER */
	section->ctx.reg_dump[CPER_ACA_REG_CTL_LO]    = 0x1;
	section->ctx.reg_dump[CPER_ACA_REG_CTL_HI]    = 0x0;
	section->ctx.reg_dump[CPER_ACA_REG_STATUS_LO] = 0x137;
	section->ctx.reg_dump[CPER_ACA_REG_STATUS_HI] = 0xB0000000;
	section->ctx.reg_dump[CPER_ACA_REG_ADDR_LO]   = 0x0;
	section->ctx.reg_dump[CPER_ACA_REG_ADDR_HI]   = 0x0;
	section->ctx.reg_dump[CPER_ACA_REG_MISC0_LO]  = 0x0;
	section->ctx.reg_dump[CPER_ACA_REG_MISC0_HI]  = 0x0;
	section->ctx.reg_dump[CPER_ACA_REG_CONFIG_LO] = 0x2;
	section->ctx.reg_dump[CPER_ACA_REG_CONFIG_HI] = 0x1ff;
	section->ctx.reg_dump[CPER_ACA_REG_IPID_LO]   = 0x0;
	section->ctx.reg_dump[CPER_ACA_REG_IPID_HI]   = 0x96;
	section->ctx.reg_dump[CPER_ACA_REG_SYND_LO]   = 0x0;
	section->ctx.reg_dump[CPER_ACA_REG_SYND_HI]   = 0x0;

	__inc_entry_length(hdr, SEC_DESC_LEN + NONSTD_SEC_LEN);

	return 0;
}

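/*
 * Allocate a zeroed CPER entry sized for the header plus section_count
 * descriptors and section_count bodies of the type-specific length.
 * The caller fills the entry and is responsible for freeing it.
 */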
struct cper_hdr *amdgpu_cper_alloc_entry(struct amdgpu_device *adev,
					 enum amdgpu_cper_type type,
					 uint16_t section_count)
{
	struct cper_hdr *hdr;
	uint32_t size = 0;

	size += HDR_LEN;
	size += (SEC_DESC_LEN * section_count);

	switch (type) {
	case AMDGPU_CPER_TYPE_RUNTIME:
	case AMDGPU_CPER_TYPE_BP_THRESHOLD:
		size += (NONSTD_SEC_LEN * section_count);
		break;
	case AMDGPU_CPER_TYPE_FATAL:
		size += (FATAL_SEC_LEN * section_count);
		break;
	case AMDGPU_CPER_TYPE_BOOT:
		size += (BOOT_SEC_LEN * section_count);
		break;
	default:
		dev_err(adev->dev, "Unknown CPER Type!\n");
		return NULL;
	}

	hdr = kzalloc(size, GFP_KERNEL);
	if (!hdr)
		return NULL;

	/* Save this early */
	hdr->sec_cnt = section_count;

	return hdr;
}

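/*
 * Build and queue a one-section fatal record from a single ACA bank:
 * the bank's STATUS/ADDR/IPID/SYND registers are snapshotted into a
 * crashdump section and the finished record is pushed onto the CPER
 * ring.
 */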
int amdgpu_cper_generate_ue_record(struct amdgpu_device *adev,
				   struct aca_bank *bank)
{
	struct cper_hdr *fatal = NULL;
	struct cper_sec_crashdump_reg_data reg_data = { 0 };
	struct amdgpu_ring *ring = &adev->cper.ring_buf;
	int ret;

	fatal = amdgpu_cper_alloc_entry(adev, AMDGPU_CPER_TYPE_FATAL, 1);
	if (!fatal) {
		dev_err(adev->dev, "failed to alloc cper entry for ue record\n");
		return -ENOMEM;
	}

	reg_data.status_lo = lower_32_bits(bank->regs[ACA_REG_IDX_STATUS]);
	reg_data.status_hi = upper_32_bits(bank->regs[ACA_REG_IDX_STATUS]);
	reg_data.addr_lo   = lower_32_bits(bank->regs[ACA_REG_IDX_ADDR]);
	reg_data.addr_hi   = upper_32_bits(bank->regs[ACA_REG_IDX_ADDR]);
	reg_data.ipid_lo   = lower_32_bits(bank->regs[ACA_REG_IDX_IPID]);
	reg_data.ipid_hi   = upper_32_bits(bank->regs[ACA_REG_IDX_IPID]);
	reg_data.synd_lo   = lower_32_bits(bank->regs[ACA_REG_IDX_SYND]);
	reg_data.synd_hi   = upper_32_bits(bank->regs[ACA_REG_IDX_SYND]);

	amdgpu_cper_entry_fill_hdr(adev, fatal, AMDGPU_CPER_TYPE_FATAL, CPER_SEV_FATAL);
	ret = amdgpu_cper_entry_fill_fatal_section(adev, fatal, 0, reg_data);
	if (ret) {
		kfree(fatal);
		return ret;
	}

	amdgpu_cper_ring_write(ring, fatal, fatal->record_length);
	kfree(fatal);

	return 0;
}

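/*
 * Build and queue a record announcing that the bad page threshold has
 * been crossed, using the synthesized register dump section above.
 */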
int amdgpu_cper_generate_bp_threshold_record(struct amdgpu_device *adev)
{
	struct cper_hdr *bp_threshold = NULL;
	struct amdgpu_ring *ring = &adev->cper.ring_buf;
	int ret;

	bp_threshold = amdgpu_cper_alloc_entry(adev, AMDGPU_CPER_TYPE_BP_THRESHOLD, 1);
	if (!bp_threshold) {
		dev_err(adev->dev, "failed to alloc cper entry for bad page threshold record\n");
		return -ENOMEM;
	}

	amdgpu_cper_entry_fill_hdr(adev, bp_threshold, AMDGPU_CPER_TYPE_BP_THRESHOLD, CPER_SEV_NUM);
	ret = amdgpu_cper_entry_fill_bad_page_threshold_section(adev, bp_threshold, 0);
	if (ret) {
		kfree(bp_threshold);
		return ret;
	}

	amdgpu_cper_ring_write(ring, bp_threshold, bp_threshold->record_length);
	kfree(bp_threshold);

	return 0;
}

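/* Map an ACA error type onto the closest matching CPER severity. */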
static enum cper_error_severity amdgpu_aca_err_type_to_cper_sev(struct amdgpu_device *adev,
								enum aca_error_type aca_err_type)
{
	switch (aca_err_type) {
	case ACA_ERROR_TYPE_UE:
		return CPER_SEV_FATAL;
	case ACA_ERROR_TYPE_CE:
		return CPER_SEV_NON_FATAL_CORRECTED;
	case ACA_ERROR_TYPE_DEFERRED:
		return CPER_SEV_NON_FATAL_UNCORRECTED;
	default:
		dev_err(adev->dev, "Unknown ACA error type!\n");
		return CPER_SEV_FATAL;
	}
}

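/*
 * Build and queue one runtime record covering a whole list of ACA
 * banks.  The record severity is corrected unless any bank holds a
 * deferred error, in which case the record as a whole is raised to
 * non-fatal uncorrected; each bank still carries its own per-section
 * severity.
 */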
int amdgpu_cper_generate_ce_records(struct amdgpu_device *adev,
				    struct aca_banks *banks,
				    uint16_t bank_count)
{
	struct cper_hdr *corrected = NULL;
	enum cper_error_severity sev = CPER_SEV_NON_FATAL_CORRECTED;
	struct amdgpu_ring *ring = &adev->cper.ring_buf;
	uint32_t reg_data[CPER_ACA_REG_COUNT] = { 0 };
	struct aca_bank_node *node;
	struct aca_bank *bank;
	uint32_t i = 0;
	int ret;

	corrected = amdgpu_cper_alloc_entry(adev, AMDGPU_CPER_TYPE_RUNTIME, bank_count);
	if (!corrected) {
		dev_err(adev->dev, "failed to allocate cper entry for ce records\n");
		return -ENOMEM;
	}

	/* Raise severity if any DE is detected in the ACA bank list */
	list_for_each_entry(node, &banks->list, node) {
		bank = &node->bank;
		if (bank->aca_err_type == ACA_ERROR_TYPE_DEFERRED) {
			sev = CPER_SEV_NON_FATAL_UNCORRECTED;
			break;
		}
	}

	amdgpu_cper_entry_fill_hdr(adev, corrected, AMDGPU_CPER_TYPE_RUNTIME, sev);

	/* Combine CE and DE into one cper record */
	list_for_each_entry(node, &banks->list, node) {
		bank = &node->bank;
		reg_data[CPER_ACA_REG_CTL_LO]    = lower_32_bits(bank->regs[ACA_REG_IDX_CTL]);
		reg_data[CPER_ACA_REG_CTL_HI]    = upper_32_bits(bank->regs[ACA_REG_IDX_CTL]);
		reg_data[CPER_ACA_REG_STATUS_LO] = lower_32_bits(bank->regs[ACA_REG_IDX_STATUS]);
		reg_data[CPER_ACA_REG_STATUS_HI] = upper_32_bits(bank->regs[ACA_REG_IDX_STATUS]);
		reg_data[CPER_ACA_REG_ADDR_LO]   = lower_32_bits(bank->regs[ACA_REG_IDX_ADDR]);
		reg_data[CPER_ACA_REG_ADDR_HI]   = upper_32_bits(bank->regs[ACA_REG_IDX_ADDR]);
		reg_data[CPER_ACA_REG_MISC0_LO]  = lower_32_bits(bank->regs[ACA_REG_IDX_MISC0]);
		reg_data[CPER_ACA_REG_MISC0_HI]  = upper_32_bits(bank->regs[ACA_REG_IDX_MISC0]);
		reg_data[CPER_ACA_REG_CONFIG_LO] = lower_32_bits(bank->regs[ACA_REG_IDX_CONFIG]);
		reg_data[CPER_ACA_REG_CONFIG_HI] = upper_32_bits(bank->regs[ACA_REG_IDX_CONFIG]);
		reg_data[CPER_ACA_REG_IPID_LO]   = lower_32_bits(bank->regs[ACA_REG_IDX_IPID]);
		reg_data[CPER_ACA_REG_IPID_HI]   = upper_32_bits(bank->regs[ACA_REG_IDX_IPID]);
		reg_data[CPER_ACA_REG_SYND_LO]   = lower_32_bits(bank->regs[ACA_REG_IDX_SYND]);
		reg_data[CPER_ACA_REG_SYND_HI]   = upper_32_bits(bank->regs[ACA_REG_IDX_SYND]);

		ret = amdgpu_cper_entry_fill_runtime_section(adev, corrected, i++,
				amdgpu_aca_err_type_to_cper_sev(adev, bank->aca_err_type),
				reg_data, CPER_ACA_REG_COUNT);
		if (ret) {
			kfree(corrected);
			return ret;
		}
	}

	amdgpu_cper_ring_write(ring, corrected, corrected->record_length);
	kfree(corrected);

	return 0;
}

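/* Check whether the ring data at @pos starts with a CPER header signature. */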
static bool amdgpu_cper_is_hdr(struct amdgpu_ring *ring, u64 pos)
{
	struct cper_hdr *chdr;

	chdr = (struct cper_hdr *)&(ring->ring[pos]);
	return !strcmp(chdr->signature, "CPER");
}

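/*
 * Return how many bytes may be consumed at @pos in one pass: the record
 * length if @pos holds a CPER header, otherwise the distance to the
 * next header (scanning forward once the ring has filled), in either
 * case capped at the space left before the end of the ring buffer.
 */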
static u32 amdgpu_cper_ring_get_ent_sz(struct amdgpu_ring *ring, u64 pos)
{
	struct cper_hdr *chdr;
	u64 p;
	u32 chunk, rec_len = 0;

	chdr = (struct cper_hdr *)&(ring->ring[pos]);
	chunk = ring->ring_size - (pos << 2);

	if (!strcmp(chdr->signature, "CPER")) {
		rec_len = chdr->record_length;
		goto calc;
	}

	/* ring buffer is not full, no cper data after ring->wptr */
	if (ring->count_dw)
		goto calc;

	for (p = pos + 1; p <= ring->buf_mask; p++) {
		chdr = (struct cper_hdr *)&(ring->ring[p]);
		if (!strcmp(chdr->signature, "CPER")) {
			rec_len = (p - pos) << 2;
			goto calc;
		}
	}

calc:
	if (!rec_len)
		return chunk;
	else
		return umin(rec_len, chunk);
}

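/*
 * Copy one finished CPER record into the ring buffer under the ring
 * lock.  Writes wrap entry by entry; when the write pointer overtakes
 * the read pointer, the read pointer is advanced past whole records
 * until it lands on a CPER header again, so a reader never starts in
 * the middle of a record.
 */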
void amdgpu_cper_ring_write(struct amdgpu_ring *ring, void *src, int count)
{
	u64 pos, wptr_old, rptr = *ring->rptr_cpu_addr & ring->ptr_mask;
	int rec_cnt_dw = count >> 2;
	u32 chunk, ent_sz;
	u8 *s = (u8 *)src;

	if (count >= ring->ring_size - 4) {
		dev_err(ring->adev->dev,
			"CPER data size(%d) is larger than ring size(%d)\n",
			count, ring->ring_size - 4);

		return;
	}

	wptr_old = ring->wptr;

	mutex_lock(&ring->adev->cper.ring_lock);
	while (count) {
		ent_sz = amdgpu_cper_ring_get_ent_sz(ring, ring->wptr);
		chunk = umin(ent_sz, count);

		memcpy(&ring->ring[ring->wptr], s, chunk);

		ring->wptr += (chunk >> 2);
		ring->wptr &= ring->ptr_mask;
		count -= chunk;
		s += chunk;
	}

	if (ring->count_dw < rec_cnt_dw)
		ring->count_dw = 0;

	/* the buffer has overflowed, adjust rptr */
	if (((wptr_old < rptr) && (rptr <= ring->wptr)) ||
	    ((ring->wptr < wptr_old) && (wptr_old < rptr)) ||
	    ((rptr <= ring->wptr) && (ring->wptr < wptr_old))) {
		pos = (ring->wptr + 1) & ring->ptr_mask;

		do {
			ent_sz = amdgpu_cper_ring_get_ent_sz(ring, pos);

			rptr += (ent_sz >> 2);
			rptr &= ring->ptr_mask;
			*ring->rptr_cpu_addr = rptr;

			pos = rptr;
		} while (!amdgpu_cper_is_hdr(ring, rptr));
	}

	if (ring->count_dw >= rec_cnt_dw)
		ring->count_dw -= rec_cnt_dw;
	mutex_unlock(&ring->adev->cper.ring_lock);
}

static u64 amdgpu_cper_ring_get_rptr(struct amdgpu_ring *ring)
{
	return *(ring->rptr_cpu_addr);
}

static u64 amdgpu_cper_ring_get_wptr(struct amdgpu_ring *ring)
{
	return ring->wptr;
}

static const struct amdgpu_ring_funcs cper_ring_funcs = {
	.type = AMDGPU_RING_TYPE_CPER,
	.align_mask = 0xff,
	.support_64bit_ptrs = false,
	.get_rptr = amdgpu_cper_ring_get_rptr,
	.get_wptr = amdgpu_cper_ring_get_wptr,
};

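/*
 * The CPER ring is a plain, schedulerless amdgpu ring: a CPU-managed
 * log buffer rather than a command submission queue.
 */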
static int amdgpu_cper_ring_init(struct amdgpu_device *adev)
{
	struct amdgpu_ring *ring = &(adev->cper.ring_buf);

	mutex_init(&adev->cper.ring_lock);

	ring->adev = NULL;
	ring->ring_obj = NULL;
	ring->use_doorbell = false;
	ring->no_scheduler = true;
	ring->funcs = &cper_ring_funcs;

	sprintf(ring->name, "cper");
	return amdgpu_ring_init(adev, ring, CPER_MAX_RING_SIZE, NULL, 0,
				AMDGPU_RING_PRIO_DEFAULT, NULL);
}

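/*
 * Set up CPER record generation.  CPER depends on ACA; when ACA is not
 * enabled, both init and fini are no-ops.
 */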
int amdgpu_cper_init(struct amdgpu_device *adev)
{
	int r;

	if (!amdgpu_aca_is_enabled(adev))
		return 0;

	r = amdgpu_cper_ring_init(adev);
	if (r) {
		dev_err(adev->dev, "failed to initialize cper ring, r = %d\n", r);
		return r;
	}

	mutex_init(&adev->cper.cper_lock);

	adev->cper.enabled = true;
	adev->cper.max_count = CPER_MAX_ALLOWED_COUNT;

	return 0;
}

int amdgpu_cper_fini(struct amdgpu_device *adev)
{
	if (!amdgpu_aca_is_enabled(adev))
		return 0;

	adev->cper.enabled = false;

	amdgpu_ring_fini(&(adev->cper.ring_buf));
	adev->cper.count = 0;
	adev->cper.wptr = 0;

	return 0;
}