// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright 2025 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 */
#include <linux/list.h>
#include "amdgpu.h"

static const guid_t MCE = CPER_NOTIFY_MCE;
static const guid_t CMC = CPER_NOTIFY_CMC;
static const guid_t BOOT = BOOT_TYPE;

static const guid_t CRASHDUMP = AMD_CRASHDUMP;
static const guid_t RUNTIME = AMD_GPU_NONSTANDARD_ERROR;

static void __inc_entry_length(struct cper_hdr *hdr, uint32_t size)
{
	hdr->record_length += size;
}

static void amdgpu_cper_get_timestamp(struct cper_timestamp *timestamp)
{
	struct tm tm;
	time64_t now = ktime_get_real_seconds();

	time64_to_tm(now, 0, &tm);
	timestamp->seconds = tm.tm_sec;
	timestamp->minutes = tm.tm_min;
	timestamp->hours = tm.tm_hour;
	timestamp->flag = 0;
	timestamp->day = tm.tm_mday;
	timestamp->month = 1 + tm.tm_mon;
	timestamp->year = (1900 + tm.tm_year) % 100;
	timestamp->century = (1900 + tm.tm_year) / 100;
}
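/*
 * Fill the common CPER record header: the "CPER" signature, severity,
 * timestamp, a "socket:counter" record id, the PCI vendor/device pair as
 * platform id, and a notification type derived from the record type (MCE
 * for fatal and bad-page-threshold records, CMC for corrected runtime
 * errors, BOOT for boot-time records).
 */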
void amdgpu_cper_entry_fill_hdr(struct amdgpu_device *adev,
				struct cper_hdr *hdr,
				enum amdgpu_cper_type type,
				enum cper_error_severity sev)
{
	char record_id[16];

	hdr->signature[0] = 'C';
	hdr->signature[1] = 'P';
	hdr->signature[2] = 'E';
	hdr->signature[3] = 'R';
	hdr->revision = CPER_HDR_REV_1;
	hdr->signature_end = 0xFFFFFFFF;
	hdr->error_severity = sev;

	hdr->valid_bits.platform_id = 1;
	hdr->valid_bits.partition_id = 1;
	hdr->valid_bits.timestamp = 1;

	amdgpu_cper_get_timestamp(&hdr->timestamp);

	snprintf(record_id, 9, "%d:%X",
		 (adev->smuio.funcs && adev->smuio.funcs->get_socket_id) ?
			 adev->smuio.funcs->get_socket_id(adev) :
			 0,
		 atomic_inc_return(&adev->cper.unique_id));
	memcpy(hdr->record_id, record_id, 8);

	snprintf(hdr->platform_id, 16, "0x%04X:0x%04X",
		 adev->pdev->vendor, adev->pdev->device);
	/* pmfw version should be part of creator_id according to CPER spec */
	snprintf(hdr->creator_id, 16, "%s", CPER_CREATOR_ID_AMDGPU);

	switch (type) {
	case AMDGPU_CPER_TYPE_BOOT:
		hdr->notify_type = BOOT;
		break;
	case AMDGPU_CPER_TYPE_FATAL:
	case AMDGPU_CPER_TYPE_BP_THRESHOLD:
		hdr->notify_type = MCE;
		break;
	case AMDGPU_CPER_TYPE_RUNTIME:
		if (sev == CPER_SEV_NON_FATAL_CORRECTED)
			hdr->notify_type = CMC;
		else
			hdr->notify_type = MCE;
		break;
	default:
		dev_err(adev->dev, "Unknown CPER Type\n");
		break;
	}

	__inc_entry_length(hdr, HDR_LEN);
}
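/*
 * Describe one section of a CPER entry: where it lives inside the record
 * (offset/length), its severity and section-type GUID, and an "OAM%d" FRU
 * label derived from the socket id.  The bp_threshold and poison arguments
 * map to the exceed_err_threshold and latent_err descriptor flag bits.
 */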
static int amdgpu_cper_entry_fill_section_desc(struct amdgpu_device *adev,
					       struct cper_sec_desc *section_desc,
					       bool bp_threshold,
					       bool poison,
					       enum cper_error_severity sev,
					       guid_t sec_type,
					       uint32_t section_length,
					       uint32_t section_offset)
{
	section_desc->revision_minor = CPER_SEC_MINOR_REV_1;
	section_desc->revision_major = CPER_SEC_MAJOR_REV_22;
	section_desc->sec_offset = section_offset;
	section_desc->sec_length = section_length;
	section_desc->valid_bits.fru_text = 1;
	section_desc->flag_bits.primary = 1;
	section_desc->severity = sev;
	section_desc->sec_type = sec_type;

	snprintf(section_desc->fru_text, 20, "OAM%d",
		 (adev->smuio.funcs && adev->smuio.funcs->get_socket_id) ?
			 adev->smuio.funcs->get_socket_id(adev) :
			 0);

	if (bp_threshold)
		section_desc->flag_bits.exceed_err_threshold = 1;
	if (poison)
		section_desc->flag_bits.latent_err = 1;

	return 0;
}

int amdgpu_cper_entry_fill_fatal_section(struct amdgpu_device *adev,
					 struct cper_hdr *hdr,
					 uint32_t idx,
					 struct cper_sec_crashdump_reg_data reg_data)
{
	struct cper_sec_desc *section_desc;
	struct cper_sec_crashdump_fatal *section;

	section_desc = (struct cper_sec_desc *)((uint8_t *)hdr + SEC_DESC_OFFSET(idx));
	section = (struct cper_sec_crashdump_fatal *)((uint8_t *)hdr +
						      FATAL_SEC_OFFSET(hdr->sec_cnt, idx));

	amdgpu_cper_entry_fill_section_desc(adev, section_desc, false, false,
					    CPER_SEV_FATAL, CRASHDUMP, FATAL_SEC_LEN,
					    FATAL_SEC_OFFSET(hdr->sec_cnt, idx));

	section->body.reg_ctx_type = CPER_CTX_TYPE_CRASH;
	section->body.reg_arr_size = sizeof(reg_data);
	section->body.data = reg_data;

	__inc_entry_length(hdr, SEC_DESC_LEN + FATAL_SEC_LEN);

	return 0;
}
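/*
 * Fill one runtime (non-standard error) section from an ACA register dump.
 * Any severity other than "corrected" is flagged as a latent (poison) error
 * in the section descriptor.
 */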
int amdgpu_cper_entry_fill_runtime_section(struct amdgpu_device *adev,
					   struct cper_hdr *hdr,
					   uint32_t idx,
					   enum cper_error_severity sev,
					   uint32_t *reg_dump,
					   uint32_t reg_count)
{
	struct cper_sec_desc *section_desc;
	struct cper_sec_nonstd_err *section;
	bool poison;

	poison = (sev != CPER_SEV_NON_FATAL_CORRECTED);
	section_desc = (struct cper_sec_desc *)((uint8_t *)hdr + SEC_DESC_OFFSET(idx));
	section = (struct cper_sec_nonstd_err *)((uint8_t *)hdr +
						 NONSTD_SEC_OFFSET(hdr->sec_cnt, idx));

	amdgpu_cper_entry_fill_section_desc(adev, section_desc, false, poison,
					    sev, RUNTIME, NONSTD_SEC_LEN,
					    NONSTD_SEC_OFFSET(hdr->sec_cnt, idx));

	reg_count = umin(reg_count, CPER_ACA_REG_COUNT);

	section->hdr.valid_bits.err_info_cnt = 1;
	section->hdr.valid_bits.err_context_cnt = 1;

	section->info.error_type = RUNTIME;
	section->info.ms_chk_bits.err_type_valid = 1;
	section->ctx.reg_ctx_type = CPER_CTX_TYPE_CRASH;
	section->ctx.reg_arr_size = sizeof(section->ctx.reg_dump);

	memcpy(section->ctx.reg_dump, reg_dump, reg_count * sizeof(uint32_t));

	__inc_entry_length(hdr, SEC_DESC_LEN + NONSTD_SEC_LEN);

	return 0;
}
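/*
 * Bad page threshold records carry no live ACA bank, so the register dump
 * below is a fixed, synthetic one; the meaningful signal is the
 * exceed_err_threshold flag set in the section descriptor.
 */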
int amdgpu_cper_entry_fill_bad_page_threshold_section(struct amdgpu_device *adev,
						      struct cper_hdr *hdr,
						      uint32_t idx)
{
	struct cper_sec_desc *section_desc;
	struct cper_sec_nonstd_err *section;

	section_desc = (struct cper_sec_desc *)((uint8_t *)hdr + SEC_DESC_OFFSET(idx));
	section = (struct cper_sec_nonstd_err *)((uint8_t *)hdr +
						 NONSTD_SEC_OFFSET(hdr->sec_cnt, idx));

	amdgpu_cper_entry_fill_section_desc(adev, section_desc, true, false,
					    CPER_SEV_NUM, RUNTIME, NONSTD_SEC_LEN,
					    NONSTD_SEC_OFFSET(hdr->sec_cnt, idx));

	section->hdr.valid_bits.err_info_cnt = 1;
	section->hdr.valid_bits.err_context_cnt = 1;

	section->info.error_type = RUNTIME;
	section->info.ms_chk_bits.err_type_valid = 1;
	section->ctx.reg_ctx_type = CPER_CTX_TYPE_CRASH;
	section->ctx.reg_arr_size = sizeof(section->ctx.reg_dump);

	/* Hardcoded Reg dump for bad page threshold CPER */
	section->ctx.reg_dump[CPER_ACA_REG_CTL_LO] = 0x1;
	section->ctx.reg_dump[CPER_ACA_REG_CTL_HI] = 0x0;
	section->ctx.reg_dump[CPER_ACA_REG_STATUS_LO] = 0x137;
	section->ctx.reg_dump[CPER_ACA_REG_STATUS_HI] = 0xB0000000;
	section->ctx.reg_dump[CPER_ACA_REG_ADDR_LO] = 0x0;
	section->ctx.reg_dump[CPER_ACA_REG_ADDR_HI] = 0x0;
	section->ctx.reg_dump[CPER_ACA_REG_MISC0_LO] = 0x0;
	section->ctx.reg_dump[CPER_ACA_REG_MISC0_HI] = 0x0;
	section->ctx.reg_dump[CPER_ACA_REG_CONFIG_LO] = 0x2;
	section->ctx.reg_dump[CPER_ACA_REG_CONFIG_HI] = 0x1ff;
	section->ctx.reg_dump[CPER_ACA_REG_IPID_LO] = 0x0;
	section->ctx.reg_dump[CPER_ACA_REG_IPID_HI] = 0x96;
	section->ctx.reg_dump[CPER_ACA_REG_SYND_LO] = 0x0;
	section->ctx.reg_dump[CPER_ACA_REG_SYND_HI] = 0x0;

	__inc_entry_length(hdr, SEC_DESC_LEN + NONSTD_SEC_LEN);

	return 0;
}

struct cper_hdr *amdgpu_cper_alloc_entry(struct amdgpu_device *adev,
					 enum amdgpu_cper_type type,
					 uint16_t section_count)
{
	struct cper_hdr *hdr;
	uint32_t size = 0;

	size += HDR_LEN;
	size += (SEC_DESC_LEN * section_count);

	switch (type) {
	case AMDGPU_CPER_TYPE_RUNTIME:
	case AMDGPU_CPER_TYPE_BP_THRESHOLD:
		size += (NONSTD_SEC_LEN * section_count);
		break;
	case AMDGPU_CPER_TYPE_FATAL:
		size += (FATAL_SEC_LEN * section_count);
		break;
	case AMDGPU_CPER_TYPE_BOOT:
		size += (BOOT_SEC_LEN * section_count);
		break;
	default:
		dev_err(adev->dev, "Unknown CPER Type!\n");
		return NULL;
	}

	hdr = kzalloc(size, GFP_KERNEL);
	if (!hdr)
		return NULL;

	/* Save this early */
	hdr->sec_cnt = section_count;

	return hdr;
}
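/*
 * Every record generator below follows the same lifecycle: allocate an
 * entry sized for its section count, fill the common header, fill each
 * section, copy the finished record into the CPER ring, and free the
 * staging buffer.  For example (sketch only, error handling omitted):
 *
 *	hdr = amdgpu_cper_alloc_entry(adev, AMDGPU_CPER_TYPE_FATAL, 1);
 *	amdgpu_cper_entry_fill_hdr(adev, hdr, AMDGPU_CPER_TYPE_FATAL,
 *				   CPER_SEV_FATAL);
 *	amdgpu_cper_entry_fill_fatal_section(adev, hdr, 0, reg_data);
 *	amdgpu_cper_ring_write(ring, hdr, hdr->record_length);
 *	kfree(hdr);
 */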
int amdgpu_cper_generate_ue_record(struct amdgpu_device *adev,
				   struct aca_bank *bank)
{
	struct cper_hdr *fatal = NULL;
	struct cper_sec_crashdump_reg_data reg_data = { 0 };
	struct amdgpu_ring *ring = &adev->cper.ring_buf;
	int ret;

	fatal = amdgpu_cper_alloc_entry(adev, AMDGPU_CPER_TYPE_FATAL, 1);
	if (!fatal) {
		dev_err(adev->dev, "fail to alloc cper entry for ue record\n");
		return -ENOMEM;
	}

	reg_data.status_lo = lower_32_bits(bank->regs[ACA_REG_IDX_STATUS]);
	reg_data.status_hi = upper_32_bits(bank->regs[ACA_REG_IDX_STATUS]);
	reg_data.addr_lo = lower_32_bits(bank->regs[ACA_REG_IDX_ADDR]);
	reg_data.addr_hi = upper_32_bits(bank->regs[ACA_REG_IDX_ADDR]);
	reg_data.ipid_lo = lower_32_bits(bank->regs[ACA_REG_IDX_IPID]);
	reg_data.ipid_hi = upper_32_bits(bank->regs[ACA_REG_IDX_IPID]);
	reg_data.synd_lo = lower_32_bits(bank->regs[ACA_REG_IDX_SYND]);
	reg_data.synd_hi = upper_32_bits(bank->regs[ACA_REG_IDX_SYND]);

	amdgpu_cper_entry_fill_hdr(adev, fatal, AMDGPU_CPER_TYPE_FATAL, CPER_SEV_FATAL);
	ret = amdgpu_cper_entry_fill_fatal_section(adev, fatal, 0, reg_data);
	if (ret)
		goto out;

	amdgpu_cper_ring_write(ring, fatal, fatal->record_length);

out:
	/* free the staging entry even on failure to avoid leaking it */
	kfree(fatal);

	return ret;
}

int amdgpu_cper_generate_bp_threshold_record(struct amdgpu_device *adev)
{
	struct cper_hdr *bp_threshold = NULL;
	struct amdgpu_ring *ring = &adev->cper.ring_buf;
	int ret;

	bp_threshold = amdgpu_cper_alloc_entry(adev, AMDGPU_CPER_TYPE_BP_THRESHOLD, 1);
	if (!bp_threshold) {
		dev_err(adev->dev, "fail to alloc cper entry for bad page threshold record\n");
		return -ENOMEM;
	}

	amdgpu_cper_entry_fill_hdr(adev, bp_threshold, AMDGPU_CPER_TYPE_BP_THRESHOLD, CPER_SEV_NUM);
	ret = amdgpu_cper_entry_fill_bad_page_threshold_section(adev, bp_threshold, 0);
	if (ret)
		goto out;

	amdgpu_cper_ring_write(ring, bp_threshold, bp_threshold->record_length);

out:
	kfree(bp_threshold);

	return ret;
}

static enum cper_error_severity amdgpu_aca_err_type_to_cper_sev(struct amdgpu_device *adev,
								enum aca_error_type aca_err_type)
{
	switch (aca_err_type) {
	case ACA_ERROR_TYPE_UE:
		return CPER_SEV_FATAL;
	case ACA_ERROR_TYPE_CE:
		return CPER_SEV_NON_FATAL_CORRECTED;
	case ACA_ERROR_TYPE_DEFERRED:
		return CPER_SEV_NON_FATAL_UNCORRECTED;
	default:
		dev_err(adev->dev, "Unknown ACA error type!\n");
		return CPER_SEV_FATAL;
	}
}
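/*
 * Corrected and deferred errors are batched: one record carries a runtime
 * section per ACA bank.  The record-level severity is raised to
 * "uncorrected" if any bank in the list holds a deferred error, while each
 * section keeps its own per-bank severity.
 */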
int amdgpu_cper_generate_ce_records(struct amdgpu_device *adev,
				    struct aca_banks *banks,
				    uint16_t bank_count)
{
	struct cper_hdr *corrected = NULL;
	enum cper_error_severity sev = CPER_SEV_NON_FATAL_CORRECTED;
	struct amdgpu_ring *ring = &adev->cper.ring_buf;
	uint32_t reg_data[CPER_ACA_REG_COUNT] = { 0 };
	struct aca_bank_node *node;
	struct aca_bank *bank;
	uint32_t i = 0;
	int ret = 0;

	corrected = amdgpu_cper_alloc_entry(adev, AMDGPU_CPER_TYPE_RUNTIME, bank_count);
	if (!corrected) {
		dev_err(adev->dev, "fail to allocate cper entry for ce records\n");
		return -ENOMEM;
	}

	/* Raise severity if any DE is detected in the ACA bank list */
	list_for_each_entry(node, &banks->list, node) {
		bank = &node->bank;
		if (bank->aca_err_type == ACA_ERROR_TYPE_DEFERRED) {
			sev = CPER_SEV_NON_FATAL_UNCORRECTED;
			break;
		}
	}

	amdgpu_cper_entry_fill_hdr(adev, corrected, AMDGPU_CPER_TYPE_RUNTIME, sev);

	/* Combine CE and DE in cper record */
	list_for_each_entry(node, &banks->list, node) {
		bank = &node->bank;
		reg_data[CPER_ACA_REG_CTL_LO] = lower_32_bits(bank->regs[ACA_REG_IDX_CTL]);
		reg_data[CPER_ACA_REG_CTL_HI] = upper_32_bits(bank->regs[ACA_REG_IDX_CTL]);
		reg_data[CPER_ACA_REG_STATUS_LO] = lower_32_bits(bank->regs[ACA_REG_IDX_STATUS]);
		reg_data[CPER_ACA_REG_STATUS_HI] = upper_32_bits(bank->regs[ACA_REG_IDX_STATUS]);
		reg_data[CPER_ACA_REG_ADDR_LO] = lower_32_bits(bank->regs[ACA_REG_IDX_ADDR]);
		reg_data[CPER_ACA_REG_ADDR_HI] = upper_32_bits(bank->regs[ACA_REG_IDX_ADDR]);
		reg_data[CPER_ACA_REG_MISC0_LO] = lower_32_bits(bank->regs[ACA_REG_IDX_MISC0]);
		reg_data[CPER_ACA_REG_MISC0_HI] = upper_32_bits(bank->regs[ACA_REG_IDX_MISC0]);
		reg_data[CPER_ACA_REG_CONFIG_LO] = lower_32_bits(bank->regs[ACA_REG_IDX_CONFIG]);
		reg_data[CPER_ACA_REG_CONFIG_HI] = upper_32_bits(bank->regs[ACA_REG_IDX_CONFIG]);
		reg_data[CPER_ACA_REG_IPID_LO] = lower_32_bits(bank->regs[ACA_REG_IDX_IPID]);
		reg_data[CPER_ACA_REG_IPID_HI] = upper_32_bits(bank->regs[ACA_REG_IDX_IPID]);
		reg_data[CPER_ACA_REG_SYND_LO] = lower_32_bits(bank->regs[ACA_REG_IDX_SYND]);
		reg_data[CPER_ACA_REG_SYND_HI] = upper_32_bits(bank->regs[ACA_REG_IDX_SYND]);

		ret = amdgpu_cper_entry_fill_runtime_section(adev, corrected, i++,
				amdgpu_aca_err_type_to_cper_sev(adev, bank->aca_err_type),
				reg_data, CPER_ACA_REG_COUNT);
		if (ret)
			goto out;
	}

	amdgpu_cper_ring_write(ring, corrected, corrected->record_length);

out:
	kfree(corrected);

	return ret;
}
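/*
 * The CPER ring is a byte stream of variable-length records; these helpers
 * let the writer walk it.  amdgpu_cper_is_hdr() tests for a "CPER"
 * signature at a dword offset, and amdgpu_cper_ring_get_ent_sz() returns
 * how many contiguous bytes belong to the entry at @pos before the buffer
 * wraps.
 */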
static bool amdgpu_cper_is_hdr(struct amdgpu_ring *ring, u64 pos)
{
	struct cper_hdr *chdr;

	chdr = (struct cper_hdr *)&(ring->ring[pos]);
	return !strcmp(chdr->signature, "CPER");
}

static u32 amdgpu_cper_ring_get_ent_sz(struct amdgpu_ring *ring, u64 pos)
{
	struct cper_hdr *chdr;
	u64 p;
	u32 chunk, rec_len = 0;

	chdr = (struct cper_hdr *)&(ring->ring[pos]);
	chunk = ring->ring_size - (pos << 2);

	if (!strcmp(chdr->signature, "CPER")) {
		rec_len = chdr->record_length;
		goto calc;
	}

	/* ring buffer is not full, no cper data after ring->wptr */
	if (ring->count_dw)
		goto calc;

	/* scan forward for the next record header */
	for (p = pos + 1; p <= ring->buf_mask; p++) {
		chdr = (struct cper_hdr *)&(ring->ring[p]);
		if (!strcmp(chdr->signature, "CPER")) {
			rec_len = (p - pos) << 2;
			goto calc;
		}
	}

calc:
	if (!rec_len)
		return chunk;
	else
		return umin(rec_len, chunk);
}

void amdgpu_cper_ring_write(struct amdgpu_ring *ring, void *src, int count)
{
	u64 pos, wptr_old, rptr = *ring->rptr_cpu_addr & ring->ptr_mask;
	int rec_cnt_dw = count >> 2;
	u32 chunk, ent_sz;
	u8 *s = (u8 *)src;

	if (count >= ring->ring_size - 4) {
		dev_err(ring->adev->dev,
			"CPER data size(%d) is larger than ring size(%d)\n",
			count, ring->ring_size - 4);

		return;
	}

	wptr_old = ring->wptr;

	mutex_lock(&ring->adev->cper.ring_lock);
	while (count) {
		ent_sz = amdgpu_cper_ring_get_ent_sz(ring, ring->wptr);
		chunk = umin(ent_sz, count);

		memcpy(&ring->ring[ring->wptr], s, chunk);

		ring->wptr += (chunk >> 2);
		ring->wptr &= ring->ptr_mask;
		count -= chunk;
		s += chunk;
	}

	if (ring->count_dw < rec_cnt_dw)
		ring->count_dw = 0;

	/* the buffer overflowed, adjust rptr */
	if (((wptr_old < rptr) && (rptr <= ring->wptr)) ||
	    ((ring->wptr < wptr_old) && (wptr_old < rptr)) ||
	    ((rptr <= ring->wptr) && (ring->wptr < wptr_old))) {
		pos = (ring->wptr + 1) & ring->ptr_mask;

		/* skip whole records until rptr lands on a "CPER" header again */
		do {
			ent_sz = amdgpu_cper_ring_get_ent_sz(ring, pos);

			rptr += (ent_sz >> 2);
			rptr &= ring->ptr_mask;
			*ring->rptr_cpu_addr = rptr;

			pos = rptr;
		} while (!amdgpu_cper_is_hdr(ring, rptr));
	}

	if (ring->count_dw >= rec_cnt_dw)
		ring->count_dw -= rec_cnt_dw;
	mutex_unlock(&ring->adev->cper.ring_lock);
}
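/*
 * Standard amdgpu_ring accessors.  The CPER ring has no hardware consumer,
 * so both pointers are plain software state: rptr lives at rptr_cpu_addr
 * (advanced by the writer on overflow) and wptr is tracked in the ring
 * struct itself.
 */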
static u64 amdgpu_cper_ring_get_rptr(struct amdgpu_ring *ring)
{
	return *(ring->rptr_cpu_addr);
}

static u64 amdgpu_cper_ring_get_wptr(struct amdgpu_ring *ring)
{
	return ring->wptr;
}

static const struct amdgpu_ring_funcs cper_ring_funcs = {
	.type = AMDGPU_RING_TYPE_CPER,
	.align_mask = 0xff,
	.support_64bit_ptrs = false,
	.get_rptr = amdgpu_cper_ring_get_rptr,
	.get_wptr = amdgpu_cper_ring_get_wptr,
};

static int amdgpu_cper_ring_init(struct amdgpu_device *adev)
{
	struct amdgpu_ring *ring = &(adev->cper.ring_buf);

	mutex_init(&adev->cper.ring_lock);

	ring->adev = NULL;
	ring->ring_obj = NULL;
	ring->use_doorbell = false;
	ring->no_scheduler = true;
	ring->funcs = &cper_ring_funcs;

	sprintf(ring->name, "cper");
	return amdgpu_ring_init(adev, ring, CPER_MAX_RING_SIZE, NULL, 0,
				AMDGPU_RING_PRIO_DEFAULT, NULL);
}

int amdgpu_cper_init(struct amdgpu_device *adev)
{
	int r;

	if (!amdgpu_aca_is_enabled(adev))
		return 0;

	r = amdgpu_cper_ring_init(adev);
	if (r) {
		dev_err(adev->dev, "failed to initialize cper ring, r = %d\n", r);
		return r;
	}

	mutex_init(&adev->cper.cper_lock);

	adev->cper.enabled = true;
	adev->cper.max_count = CPER_MAX_ALLOWED_COUNT;

	return 0;
}

int amdgpu_cper_fini(struct amdgpu_device *adev)
{
	if (!amdgpu_aca_is_enabled(adev))
		return 0;

	adev->cper.enabled = false;

	amdgpu_ring_fini(&(adev->cper.ring_buf));
	adev->cper.count = 0;
	adev->cper.wptr = 0;

	return 0;
}