xref: /linux/drivers/gpu/drm/amd/ras/rascore/ras_cper.c (revision 24f171c7e145f43b9f187578e89b0982ce87e54c)
1 // SPDX-License-Identifier: MIT
2 /*
3  * Copyright 2025 Advanced Micro Devices, Inc.
4  *
5  * Permission is hereby granted, free of charge, to any person obtaining a
6  * copy of this software and associated documentation files (the "Software"),
7  * to deal in the Software without restriction, including without limitation
8  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9  * and/or sell copies of the Software, and to permit persons to whom the
10  * Software is furnished to do so, subject to the following conditions:
11  *
12  * The above copyright notice and this permission notice shall be included in
13  * all copies or substantial portions of the Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
19  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
20  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
21  * OTHER DEALINGS IN THE SOFTWARE.
22  *
23  */
24 #include "ras.h"
25 #include "ras_core_status.h"
26 #include "ras_log_ring.h"
27 #include "ras_cper.h"
28 
/* GUIDs that fill_section_hdr() assigns to the record header's notify_type. */
static const struct ras_cper_guid MCE	= CPER_NOTIFY__MCE;
static const struct ras_cper_guid CMC	= CPER_NOTIFY__CMC;
static const struct ras_cper_guid BOOT	= BOOT__TYPE;

/*
 * Section-type GUIDs: CRASHDUMP is passed for the sections of fatal records,
 * RUNTIME for the sections of runtime records (and is also stored as the
 * runtime section's descriptor error_type).
 */
static const struct ras_cper_guid CRASHDUMP = GPU__CRASHDUMP;
static const struct ras_cper_guid RUNTIME = GPU__NONSTANDARD_ERROR;
35 
36 static void cper_get_timestamp(struct ras_core_context *ras_core,
37 		struct ras_cper_timestamp *timestamp, uint64_t utc_second_timestamp)
38 {
39 	struct ras_time tm = {0};
40 
41 	ras_core_convert_timestamp_to_time(ras_core, utc_second_timestamp, &tm);
42 	timestamp->seconds = tm.tm_sec;
43 	timestamp->minutes = tm.tm_min;
44 	timestamp->hours = tm.tm_hour;
45 	timestamp->flag = 0;
46 	timestamp->day = tm.tm_mday;
47 	timestamp->month = tm.tm_mon;
48 	timestamp->year = tm.tm_year % 100;
49 	timestamp->century = tm.tm_year / 100;
50 }
51 
52 static void fill_section_hdr(struct ras_core_context *ras_core,
53 				struct cper_section_hdr *hdr, enum ras_cper_type type,
54 				enum ras_cper_severity sev, struct ras_log_info *trace)
55 {
56 	struct device_system_info dev_info = {0};
57 	char record_id[32];
58 
59 	hdr->signature[0]		= 'C';
60 	hdr->signature[1]		= 'P';
61 	hdr->signature[2]		= 'E';
62 	hdr->signature[3]		= 'R';
63 	hdr->revision			= CPER_HDR__REV_1;
64 	hdr->signature_end		= 0xFFFFFFFF;
65 	hdr->error_severity		= (sev == RAS_CPER_SEV_RMA ? RAS_CPER_SEV_FATAL_UE : sev);
66 
67 	hdr->valid_bits.platform_id	= 1;
68 	hdr->valid_bits.timestamp	= 1;
69 
70 	ras_core_get_device_system_info(ras_core, &dev_info);
71 
72 	cper_get_timestamp(ras_core, &hdr->timestamp, trace->timestamp);
73 
74 	snprintf(record_id, sizeof(record_id), "%d:%llX", dev_info.socket_id,
75 		    RAS_LOG_SEQNO_TO_BATCH_IDX(trace->seqno));
76 	memcpy(hdr->record_id, record_id, 8);
77 
78 	snprintf(hdr->platform_id, 16, "0x%04X:0x%04X",
79 		dev_info.vendor_id, dev_info.device_id);
80 	/* pmfw version should be part of creator_id according to CPER spec */
81 	snprintf(hdr->creator_id, 16, "%s", CPER_CREATOR_ID__AMDGPU);
82 
83 	switch (type) {
84 	case RAS_CPER_TYPE_BOOT:
85 		hdr->notify_type = BOOT;
86 		break;
87 	case RAS_CPER_TYPE_FATAL:
88 	case RAS_CPER_TYPE_RMA:
89 		hdr->notify_type = MCE;
90 		break;
91 	case RAS_CPER_TYPE_RUNTIME:
92 		if (sev == RAS_CPER_SEV_NON_FATAL_CE)
93 			hdr->notify_type = CMC;
94 		else
95 			hdr->notify_type = MCE;
96 		break;
97 	default:
98 		RAS_DEV_ERR(ras_core->dev, "Unknown CPER Type\n");
99 		break;
100 	}
101 }
102 
103 static int fill_section_descriptor(struct ras_core_context *ras_core,
104 					struct cper_section_descriptor *descriptor,
105 					enum ras_cper_severity sev,
106 					struct ras_cper_guid sec_type,
107 					uint32_t section_offset,
108 					uint32_t section_length)
109 {
110 	struct device_system_info dev_info = {0};
111 
112 	descriptor->revision_minor		= CPER_SEC__MINOR_REV_1;
113 	descriptor->revision_major		= CPER_SEC__MAJOR_REV_22;
114 	descriptor->sec_offset		= section_offset;
115 	descriptor->sec_length		= section_length;
116 	descriptor->valid_bits.fru_text	= 1;
117 	descriptor->flag_bits.primary	= 1;
118 	descriptor->severity = (sev == RAS_CPER_SEV_RMA ? RAS_CPER_SEV_FATAL_UE : sev);
119 	descriptor->sec_type			= sec_type;
120 
121 	ras_core_get_device_system_info(ras_core, &dev_info);
122 
123 	snprintf(descriptor->fru_text, 20, "OAM%d", dev_info.socket_id);
124 
125 	if (sev == RAS_CPER_SEV_RMA)
126 		descriptor->flag_bits.exceed_err_threshold = 1;
127 
128 	if (sev == RAS_CPER_SEV_NON_FATAL_UE)
129 		descriptor->flag_bits.latent_err = 1;
130 
131 	return 0;
132 }
133 
134 static int fill_section_fatal(struct ras_core_context *ras_core,
135 		struct cper_section_fatal *fatal, struct ras_log_info *trace)
136 {
137 	fatal->data.reg_ctx_type = CPER_CTX_TYPE__CRASH;
138 	fatal->data.reg_arr_size = sizeof(fatal->data.reg);
139 
140 	fatal->data.reg.status = trace->aca_reg.regs[RAS_CPER_ACA_REG_STATUS];
141 	fatal->data.reg.addr   = trace->aca_reg.regs[RAS_CPER_ACA_REG_ADDR];
142 	fatal->data.reg.ipid   = trace->aca_reg.regs[RAS_CPER_ACA_REG_IPID];
143 	fatal->data.reg.synd   = trace->aca_reg.regs[RAS_CPER_ACA_REG_SYND];
144 
145 	return 0;
146 }
147 
148 static int fill_section_runtime(struct ras_core_context *ras_core,
149 		struct cper_section_runtime *runtime, struct ras_log_info *trace,
150 		enum ras_cper_severity sev)
151 {
152 	runtime->hdr.valid_bits.err_info_cnt = 1;
153 	runtime->hdr.valid_bits.err_context_cnt = 1;
154 
155 	runtime->descriptor.error_type = RUNTIME;
156 	runtime->descriptor.ms_chk_bits.err_type_valid = 1;
157 	if (sev == RAS_CPER_SEV_RMA) {
158 		runtime->descriptor.valid_bits.ms_chk = 1;
159 		runtime->descriptor.ms_chk_bits.err_type = 1;
160 		runtime->descriptor.ms_chk_bits.pcc = 1;
161 	}
162 
163 	runtime->reg.reg_ctx_type = CPER_CTX_TYPE__CRASH;
164 	runtime->reg.reg_arr_size = sizeof(runtime->reg.reg_dump);
165 
166 	runtime->reg.reg_dump[RAS_CPER_ACA_REG_CTL]    = trace->aca_reg.regs[ACA_REG_IDX__CTL];
167 	runtime->reg.reg_dump[RAS_CPER_ACA_REG_STATUS] = trace->aca_reg.regs[ACA_REG_IDX__STATUS];
168 	runtime->reg.reg_dump[RAS_CPER_ACA_REG_ADDR]   = trace->aca_reg.regs[ACA_REG_IDX__ADDR];
169 	runtime->reg.reg_dump[RAS_CPER_ACA_REG_MISC0]  = trace->aca_reg.regs[ACA_REG_IDX__MISC0];
170 	runtime->reg.reg_dump[RAS_CPER_ACA_REG_CONFIG] = trace->aca_reg.regs[ACA_REG_IDX__CONFG];
171 	runtime->reg.reg_dump[RAS_CPER_ACA_REG_IPID]   = trace->aca_reg.regs[ACA_REG_IDX__IPID];
172 	runtime->reg.reg_dump[RAS_CPER_ACA_REG_SYND]   = trace->aca_reg.regs[ACA_REG_IDX__SYND];
173 
174 	return 0;
175 }
176 
177 static int cper_generate_runtime_record(struct ras_core_context *ras_core,
178 	struct cper_section_hdr *hdr, struct ras_log_info **trace_arr, uint32_t arr_num,
179 		enum ras_cper_severity sev)
180 {
181 	struct cper_section_descriptor *descriptor;
182 	struct cper_section_runtime *runtime;
183 	int i;
184 
185 	fill_section_hdr(ras_core, hdr, RAS_CPER_TYPE_RUNTIME, sev, trace_arr[0]);
186 	hdr->record_length =  RAS_HDR_LEN + ((RAS_SEC_DESC_LEN + RAS_NONSTD_SEC_LEN) * arr_num);
187 	hdr->sec_cnt = arr_num;
188 	for (i = 0; i < arr_num; i++) {
189 		descriptor = (struct cper_section_descriptor *)((uint8_t *)hdr +
190 			     RAS_SEC_DESC_OFFSET(i));
191 		runtime = (struct cper_section_runtime *)((uint8_t *)hdr +
192 			  RAS_NONSTD_SEC_OFFSET(hdr->sec_cnt, i));
193 
194 		fill_section_descriptor(ras_core, descriptor, sev, RUNTIME,
195 			RAS_NONSTD_SEC_OFFSET(hdr->sec_cnt, i),
196 			sizeof(struct cper_section_runtime));
197 		fill_section_runtime(ras_core, runtime, trace_arr[i], sev);
198 	}
199 
200 	return 0;
201 }
202 
203 static int cper_generate_fatal_record(struct ras_core_context *ras_core,
204 	uint8_t *buffer, struct ras_log_info **trace_arr, uint32_t arr_num)
205 {
206 	struct ras_cper_fatal_record record = {0};
207 	int i = 0;
208 
209 	for (i = 0; i < arr_num; i++) {
210 		fill_section_hdr(ras_core, &record.hdr, RAS_CPER_TYPE_FATAL,
211 				 RAS_CPER_SEV_FATAL_UE, trace_arr[i]);
212 		record.hdr.record_length =  RAS_HDR_LEN + RAS_SEC_DESC_LEN + RAS_FATAL_SEC_LEN;
213 		record.hdr.sec_cnt = 1;
214 
215 		fill_section_descriptor(ras_core, &record.descriptor, RAS_CPER_SEV_FATAL_UE,
216 					CRASHDUMP, offsetof(struct ras_cper_fatal_record, fatal),
217 					sizeof(struct cper_section_fatal));
218 
219 		fill_section_fatal(ras_core, &record.fatal, trace_arr[i]);
220 
221 		memcpy(buffer + (i * record.hdr.record_length),
222 				&record, record.hdr.record_length);
223 	}
224 
225 	return 0;
226 }
227 
228 static int cper_get_record_size(enum ras_cper_type type, uint16_t section_count)
229 {
230 	int size = 0;
231 
232 	size += RAS_HDR_LEN;
233 	size += (RAS_SEC_DESC_LEN * section_count);
234 
235 	switch (type) {
236 	case RAS_CPER_TYPE_RUNTIME:
237 	case RAS_CPER_TYPE_RMA:
238 		size += (RAS_NONSTD_SEC_LEN * section_count);
239 		break;
240 	case RAS_CPER_TYPE_FATAL:
241 		size += (RAS_FATAL_SEC_LEN * section_count);
242 		size += (RAS_HDR_LEN * (section_count - 1));
243 		break;
244 	case RAS_CPER_TYPE_BOOT:
245 		size += (RAS_BOOT_SEC_LEN * section_count);
246 		break;
247 	default:
248 		/* should never reach here */
249 		break;
250 	}
251 
252 	return size;
253 }
254 
255 static enum ras_cper_type cper_ras_log_event_to_cper_type(enum ras_log_event event)
256 {
257 	switch (event) {
258 	case RAS_LOG_EVENT_UE:
259 		return RAS_CPER_TYPE_FATAL;
260 	case RAS_LOG_EVENT_DE:
261 	case RAS_LOG_EVENT_CE:
262 	case RAS_LOG_EVENT_POISON_CREATION:
263 	case RAS_LOG_EVENT_POISON_CONSUMPTION:
264 		return RAS_CPER_TYPE_RUNTIME;
265 	case RAS_LOG_EVENT_RMA:
266 		return RAS_CPER_TYPE_RMA;
267 	default:
268 		/* should never reach here */
269 		return RAS_CPER_TYPE_RUNTIME;
270 	}
271 }
272 
273 int ras_cper_generate_cper(struct ras_core_context *ras_core,
274 		struct ras_log_info **trace_list, uint32_t count,
275 		uint8_t *buf, uint32_t buf_len, uint32_t *real_data_len)
276 {
277 	uint8_t *buffer = buf;
278 	uint64_t buf_size = buf_len;
279 	int record_size, saved_size = 0;
280 	struct cper_section_hdr *hdr;
281 
282 	/* All the batch traces share the same event */
283 	record_size = cper_get_record_size(
284 			cper_ras_log_event_to_cper_type(trace_list[0]->event), count);
285 
286 	if ((record_size + saved_size) > buf_size)
287 		return -ENOMEM;
288 
289 	hdr = (struct cper_section_hdr *)(buffer + saved_size);
290 
291 	switch (trace_list[0]->event) {
292 	case RAS_LOG_EVENT_RMA:
293 		cper_generate_runtime_record(ras_core, hdr, trace_list, count, RAS_CPER_SEV_RMA);
294 		break;
295 	case RAS_LOG_EVENT_DE:
296 		cper_generate_runtime_record(ras_core,
297 			hdr, trace_list, count, RAS_CPER_SEV_NON_FATAL_UE);
298 		break;
299 	case RAS_LOG_EVENT_CE:
300 		cper_generate_runtime_record(ras_core,
301 			hdr, trace_list, count, RAS_CPER_SEV_NON_FATAL_CE);
302 		break;
303 	case RAS_LOG_EVENT_UE:
304 		cper_generate_fatal_record(ras_core, buffer + saved_size, trace_list, count);
305 		break;
306 	default:
307 		RAS_DEV_WARN(ras_core->dev, "Unprocessed trace event: %d\n", trace_list[0]->event);
308 		break;
309 	}
310 
311 	saved_size += record_size;
312 
313 	*real_data_len = saved_size;
314 	return 0;
315 }
316