1 // SPDX-License-Identifier: MIT 2 /* 3 * Copyright 2025 Advanced Micro Devices, Inc. 4 * 5 * Permission is hereby granted, free of charge, to any person obtaining a 6 * copy of this software and associated documentation files (the "Software"), 7 * to deal in the Software without restriction, including without limitation 8 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 9 * and/or sell copies of the Software, and to permit persons to whom the 10 * Software is furnished to do so, subject to the following conditions: 11 * 12 * The above copyright notice and this permission notice shall be included in 13 * all copies or substantial portions of the Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 19 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 20 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 21 * OTHER DEALINGS IN THE SOFTWARE. 22 * 23 */ 24 #include "ras.h" 25 #include "ras_core_status.h" 26 #include "ras_log_ring.h" 27 #include "ras_cper.h" 28 29 static const struct ras_cper_guid MCE = CPER_NOTIFY__MCE; 30 static const struct ras_cper_guid CMC = CPER_NOTIFY__CMC; 31 static const struct ras_cper_guid BOOT = BOOT__TYPE; 32 33 static const struct ras_cper_guid CRASHDUMP = GPU__CRASHDUMP; 34 static const struct ras_cper_guid RUNTIME = GPU__NONSTANDARD_ERROR; 35 36 static void cper_get_timestamp(struct ras_core_context *ras_core, 37 struct ras_cper_timestamp *timestamp, uint64_t utc_second_timestamp) 38 { 39 struct ras_time tm = {0}; 40 41 ras_core_convert_timestamp_to_time(ras_core, utc_second_timestamp, &tm); 42 timestamp->seconds = tm.tm_sec; 43 timestamp->minutes = tm.tm_min; 44 timestamp->hours = tm.tm_hour; 45 timestamp->flag = 0; 46 timestamp->day = tm.tm_mday; 47 timestamp->month = tm.tm_mon; 48 timestamp->year = tm.tm_year % 100; 49 timestamp->century = tm.tm_year / 100; 50 } 51 52 static void fill_section_hdr(struct ras_core_context *ras_core, 53 struct cper_section_hdr *hdr, enum ras_cper_type type, 54 enum ras_cper_severity sev, struct ras_log_info *trace) 55 { 56 struct device_system_info dev_info = {0}; 57 char record_id[32]; 58 59 hdr->signature[0] = 'C'; 60 hdr->signature[1] = 'P'; 61 hdr->signature[2] = 'E'; 62 hdr->signature[3] = 'R'; 63 hdr->revision = CPER_HDR__REV_1; 64 hdr->signature_end = 0xFFFFFFFF; 65 hdr->error_severity = (sev == RAS_CPER_SEV_RMA ? RAS_CPER_SEV_FATAL_UE : sev); 66 67 hdr->valid_bits.platform_id = 1; 68 hdr->valid_bits.timestamp = 1; 69 70 ras_core_get_device_system_info(ras_core, &dev_info); 71 72 cper_get_timestamp(ras_core, &hdr->timestamp, trace->timestamp); 73 74 snprintf(record_id, sizeof(record_id), "%d:%llX", dev_info.socket_id, 75 RAS_LOG_SEQNO_TO_BATCH_IDX(trace->seqno)); 76 memcpy(hdr->record_id, record_id, 8); 77 78 snprintf(hdr->platform_id, 16, "0x%04X:0x%04X", 79 dev_info.vendor_id, dev_info.device_id); 80 /* pmfw version should be part of creator_id according to CPER spec */ 81 snprintf(hdr->creator_id, 16, "%s", CPER_CREATOR_ID__AMDGPU); 82 83 switch (type) { 84 case RAS_CPER_TYPE_BOOT: 85 hdr->notify_type = BOOT; 86 break; 87 case RAS_CPER_TYPE_FATAL: 88 case RAS_CPER_TYPE_RMA: 89 hdr->notify_type = MCE; 90 break; 91 case RAS_CPER_TYPE_RUNTIME: 92 if (sev == RAS_CPER_SEV_NON_FATAL_CE) 93 hdr->notify_type = CMC; 94 else 95 hdr->notify_type = MCE; 96 break; 97 default: 98 RAS_DEV_ERR(ras_core->dev, "Unknown CPER Type\n"); 99 break; 100 } 101 } 102 103 static int fill_section_descriptor(struct ras_core_context *ras_core, 104 struct cper_section_descriptor *descriptor, 105 enum ras_cper_severity sev, 106 struct ras_cper_guid sec_type, 107 uint32_t section_offset, 108 uint32_t section_length) 109 { 110 struct device_system_info dev_info = {0}; 111 112 descriptor->revision_minor = CPER_SEC__MINOR_REV_1; 113 descriptor->revision_major = CPER_SEC__MAJOR_REV_22; 114 descriptor->sec_offset = section_offset; 115 descriptor->sec_length = section_length; 116 descriptor->valid_bits.fru_text = 1; 117 descriptor->flag_bits.primary = 1; 118 descriptor->severity = (sev == RAS_CPER_SEV_RMA ? RAS_CPER_SEV_FATAL_UE : sev); 119 descriptor->sec_type = sec_type; 120 121 ras_core_get_device_system_info(ras_core, &dev_info); 122 123 snprintf(descriptor->fru_text, 20, "OAM%d", dev_info.socket_id); 124 125 if (sev == RAS_CPER_SEV_RMA) 126 descriptor->flag_bits.exceed_err_threshold = 1; 127 128 if (sev == RAS_CPER_SEV_NON_FATAL_UE) 129 descriptor->flag_bits.latent_err = 1; 130 131 return 0; 132 } 133 134 static int fill_section_fatal(struct ras_core_context *ras_core, 135 struct cper_section_fatal *fatal, struct ras_log_info *trace) 136 { 137 fatal->data.reg_ctx_type = CPER_CTX_TYPE__CRASH; 138 fatal->data.reg_arr_size = sizeof(fatal->data.reg); 139 140 fatal->data.reg.status = trace->aca_reg.regs[RAS_CPER_ACA_REG_STATUS]; 141 fatal->data.reg.addr = trace->aca_reg.regs[RAS_CPER_ACA_REG_ADDR]; 142 fatal->data.reg.ipid = trace->aca_reg.regs[RAS_CPER_ACA_REG_IPID]; 143 fatal->data.reg.synd = trace->aca_reg.regs[RAS_CPER_ACA_REG_SYND]; 144 145 return 0; 146 } 147 148 static int fill_section_runtime(struct ras_core_context *ras_core, 149 struct cper_section_runtime *runtime, struct ras_log_info *trace, 150 enum ras_cper_severity sev) 151 { 152 runtime->hdr.valid_bits.err_info_cnt = 1; 153 runtime->hdr.valid_bits.err_context_cnt = 1; 154 155 runtime->descriptor.error_type = RUNTIME; 156 runtime->descriptor.ms_chk_bits.err_type_valid = 1; 157 if (sev == RAS_CPER_SEV_RMA) { 158 runtime->descriptor.valid_bits.ms_chk = 1; 159 runtime->descriptor.ms_chk_bits.err_type = 1; 160 runtime->descriptor.ms_chk_bits.pcc = 1; 161 } 162 163 runtime->reg.reg_ctx_type = CPER_CTX_TYPE__CRASH; 164 runtime->reg.reg_arr_size = sizeof(runtime->reg.reg_dump); 165 166 runtime->reg.reg_dump[RAS_CPER_ACA_REG_CTL] = trace->aca_reg.regs[ACA_REG_IDX__CTL]; 167 runtime->reg.reg_dump[RAS_CPER_ACA_REG_STATUS] = trace->aca_reg.regs[ACA_REG_IDX__STATUS]; 168 runtime->reg.reg_dump[RAS_CPER_ACA_REG_ADDR] = trace->aca_reg.regs[ACA_REG_IDX__ADDR]; 169 runtime->reg.reg_dump[RAS_CPER_ACA_REG_MISC0] = trace->aca_reg.regs[ACA_REG_IDX__MISC0]; 170 runtime->reg.reg_dump[RAS_CPER_ACA_REG_CONFIG] = trace->aca_reg.regs[ACA_REG_IDX__CONFG]; 171 runtime->reg.reg_dump[RAS_CPER_ACA_REG_IPID] = trace->aca_reg.regs[ACA_REG_IDX__IPID]; 172 runtime->reg.reg_dump[RAS_CPER_ACA_REG_SYND] = trace->aca_reg.regs[ACA_REG_IDX__SYND]; 173 174 return 0; 175 } 176 177 static int cper_generate_runtime_record(struct ras_core_context *ras_core, 178 struct cper_section_hdr *hdr, struct ras_log_info **trace_arr, uint32_t arr_num, 179 enum ras_cper_severity sev) 180 { 181 struct cper_section_descriptor *descriptor; 182 struct cper_section_runtime *runtime; 183 int i; 184 185 fill_section_hdr(ras_core, hdr, RAS_CPER_TYPE_RUNTIME, sev, trace_arr[0]); 186 hdr->record_length = RAS_HDR_LEN + ((RAS_SEC_DESC_LEN + RAS_NONSTD_SEC_LEN) * arr_num); 187 hdr->sec_cnt = arr_num; 188 for (i = 0; i < arr_num; i++) { 189 descriptor = (struct cper_section_descriptor *)((uint8_t *)hdr + 190 RAS_SEC_DESC_OFFSET(i)); 191 runtime = (struct cper_section_runtime *)((uint8_t *)hdr + 192 RAS_NONSTD_SEC_OFFSET(hdr->sec_cnt, i)); 193 194 fill_section_descriptor(ras_core, descriptor, sev, RUNTIME, 195 RAS_NONSTD_SEC_OFFSET(hdr->sec_cnt, i), 196 sizeof(struct cper_section_runtime)); 197 fill_section_runtime(ras_core, runtime, trace_arr[i], sev); 198 } 199 200 return 0; 201 } 202 203 static int cper_generate_fatal_record(struct ras_core_context *ras_core, 204 uint8_t *buffer, struct ras_log_info **trace_arr, uint32_t arr_num) 205 { 206 struct ras_cper_fatal_record record = {0}; 207 int i = 0; 208 209 for (i = 0; i < arr_num; i++) { 210 fill_section_hdr(ras_core, &record.hdr, RAS_CPER_TYPE_FATAL, 211 RAS_CPER_SEV_FATAL_UE, trace_arr[i]); 212 record.hdr.record_length = RAS_HDR_LEN + RAS_SEC_DESC_LEN + RAS_FATAL_SEC_LEN; 213 record.hdr.sec_cnt = 1; 214 215 fill_section_descriptor(ras_core, &record.descriptor, RAS_CPER_SEV_FATAL_UE, 216 CRASHDUMP, offsetof(struct ras_cper_fatal_record, fatal), 217 sizeof(struct cper_section_fatal)); 218 219 fill_section_fatal(ras_core, &record.fatal, trace_arr[i]); 220 221 memcpy(buffer + (i * record.hdr.record_length), 222 &record, record.hdr.record_length); 223 } 224 225 return 0; 226 } 227 228 static int cper_get_record_size(enum ras_cper_type type, uint16_t section_count) 229 { 230 int size = 0; 231 232 size += RAS_HDR_LEN; 233 size += (RAS_SEC_DESC_LEN * section_count); 234 235 switch (type) { 236 case RAS_CPER_TYPE_RUNTIME: 237 case RAS_CPER_TYPE_RMA: 238 size += (RAS_NONSTD_SEC_LEN * section_count); 239 break; 240 case RAS_CPER_TYPE_FATAL: 241 size += (RAS_FATAL_SEC_LEN * section_count); 242 size += (RAS_HDR_LEN * (section_count - 1)); 243 break; 244 case RAS_CPER_TYPE_BOOT: 245 size += (RAS_BOOT_SEC_LEN * section_count); 246 break; 247 default: 248 /* should never reach here */ 249 break; 250 } 251 252 return size; 253 } 254 255 static enum ras_cper_type cper_ras_log_event_to_cper_type(enum ras_log_event event) 256 { 257 switch (event) { 258 case RAS_LOG_EVENT_UE: 259 return RAS_CPER_TYPE_FATAL; 260 case RAS_LOG_EVENT_DE: 261 case RAS_LOG_EVENT_CE: 262 case RAS_LOG_EVENT_POISON_CREATION: 263 case RAS_LOG_EVENT_POISON_CONSUMPTION: 264 return RAS_CPER_TYPE_RUNTIME; 265 case RAS_LOG_EVENT_RMA: 266 return RAS_CPER_TYPE_RMA; 267 default: 268 /* should never reach here */ 269 return RAS_CPER_TYPE_RUNTIME; 270 } 271 } 272 273 int ras_cper_generate_cper(struct ras_core_context *ras_core, 274 struct ras_log_info **trace_list, uint32_t count, 275 uint8_t *buf, uint32_t buf_len, uint32_t *real_data_len) 276 { 277 uint8_t *buffer = buf; 278 uint64_t buf_size = buf_len; 279 int record_size, saved_size = 0; 280 struct cper_section_hdr *hdr; 281 282 /* All the batch traces share the same event */ 283 record_size = cper_get_record_size( 284 cper_ras_log_event_to_cper_type(trace_list[0]->event), count); 285 286 if ((record_size + saved_size) > buf_size) 287 return -ENOMEM; 288 289 hdr = (struct cper_section_hdr *)(buffer + saved_size); 290 291 switch (trace_list[0]->event) { 292 case RAS_LOG_EVENT_RMA: 293 cper_generate_runtime_record(ras_core, hdr, trace_list, count, RAS_CPER_SEV_RMA); 294 break; 295 case RAS_LOG_EVENT_DE: 296 cper_generate_runtime_record(ras_core, 297 hdr, trace_list, count, RAS_CPER_SEV_NON_FATAL_UE); 298 break; 299 case RAS_LOG_EVENT_CE: 300 cper_generate_runtime_record(ras_core, 301 hdr, trace_list, count, RAS_CPER_SEV_NON_FATAL_CE); 302 break; 303 case RAS_LOG_EVENT_UE: 304 cper_generate_fatal_record(ras_core, buffer + saved_size, trace_list, count); 305 break; 306 default: 307 RAS_DEV_WARN(ras_core->dev, "Unprocessed trace event: %d\n", trace_list[0]->event); 308 break; 309 } 310 311 saved_size += record_size; 312 313 *real_data_len = saved_size; 314 return 0; 315 } 316