// SPDX-License-Identifier: MIT
/*
 * Copyright 2025 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 */
#include <linux/list.h>
#include "amdgpu.h"
#include "amdgpu_ras_mgr.h"

static const guid_t MCE			= CPER_NOTIFY_MCE;
static const guid_t CMC			= CPER_NOTIFY_CMC;
static const guid_t BOOT		= BOOT_TYPE;

static const guid_t CRASHDUMP		= AMD_CRASHDUMP;
static const guid_t RUNTIME		= AMD_GPU_NONSTANDARD_ERROR;

#define CPER_SIGNATURE_SZ		(sizeof(((struct cper_hdr *)0)->signature))

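/*
 * A CPER entry is laid out as a record header, followed by one section
 * descriptor per section, followed by the section bodies themselves;
 * SEC_DESC_OFFSET() and the type-specific *_SEC_OFFSET() macros used
 * below assume exactly that layout.
 */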
static void __inc_entry_length(struct cper_hdr *hdr, uint32_t size)
{
	hdr->record_length += size;
}

static void amdgpu_cper_get_timestamp(struct cper_timestamp *timestamp)
{
	struct tm tm;
	time64_t now = ktime_get_real_seconds();

	time64_to_tm(now, 0, &tm);
	timestamp->seconds = tm.tm_sec;
	timestamp->minutes = tm.tm_min;
	timestamp->hours = tm.tm_hour;
	timestamp->flag = 0;
	timestamp->day = tm.tm_mday;
	timestamp->month = 1 + tm.tm_mon;
	/* CPER splits the Gregorian year into two-digit year and century */
	timestamp->year = (1900 + tm.tm_year) % 100;
	timestamp->century = (1900 + tm.tm_year) / 100;
}

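/*
 * Fill the common CPER record header: signature, revision, severity,
 * timestamp, record/platform/creator IDs, and the notification type
 * derived from the record type.
 */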
void amdgpu_cper_entry_fill_hdr(struct amdgpu_device *adev,
				struct cper_hdr *hdr,
				enum amdgpu_cper_type type,
				enum cper_error_severity sev)
{
	char record_id[16];

	hdr->signature[0]		= 'C';
	hdr->signature[1]		= 'P';
	hdr->signature[2]		= 'E';
	hdr->signature[3]		= 'R';
	hdr->revision			= CPER_HDR_REV_1;
	hdr->signature_end		= 0xFFFFFFFF;
	hdr->error_severity		= sev;

	hdr->valid_bits.platform_id	= 1;
	hdr->valid_bits.timestamp	= 1;

	amdgpu_cper_get_timestamp(&hdr->timestamp);

	/* Record ID is "<socket>:<unique id>", truncated to 8 characters */
	snprintf(record_id, 9, "%d:%X",
		 (adev->smuio.funcs && adev->smuio.funcs->get_socket_id) ?
			 adev->smuio.funcs->get_socket_id(adev) :
			 0,
		 atomic_inc_return(&adev->cper.unique_id));
	memcpy(hdr->record_id, record_id, 8);

	snprintf(hdr->platform_id, 16, "0x%04X:0x%04X",
		 adev->pdev->vendor, adev->pdev->device);
	/* pmfw version should be part of creator_id according to CPER spec */
	snprintf(hdr->creator_id, 16, "%s", CPER_CREATOR_ID_AMDGPU);

	switch (type) {
	case AMDGPU_CPER_TYPE_BOOT:
		hdr->notify_type = BOOT;
		break;
	case AMDGPU_CPER_TYPE_FATAL:
	case AMDGPU_CPER_TYPE_BP_THRESHOLD:
		hdr->notify_type = MCE;
		break;
	case AMDGPU_CPER_TYPE_RUNTIME:
		if (sev == CPER_SEV_NON_FATAL_CORRECTED)
			hdr->notify_type = CMC;
		else
			hdr->notify_type = MCE;
		break;
	default:
		dev_err(adev->dev, "Unknown CPER Type\n");
		break;
	}

	__inc_entry_length(hdr, HDR_LEN);
}

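/*
 * Fill one section descriptor; exceed_err_threshold marks bad-page
 * threshold records and latent_err marks poison (deferred) errors.
 */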
static int amdgpu_cper_entry_fill_section_desc(struct amdgpu_device *adev,
					       struct cper_sec_desc *section_desc,
					       bool bp_threshold,
					       bool poison,
					       enum cper_error_severity sev,
					       guid_t sec_type,
					       uint32_t section_length,
					       uint32_t section_offset)
{
	section_desc->revision_minor		= CPER_SEC_MINOR_REV_1;
	section_desc->revision_major		= CPER_SEC_MAJOR_REV_22;
	section_desc->sec_offset		= section_offset;
	section_desc->sec_length		= section_length;
	section_desc->valid_bits.fru_text	= 1;
	section_desc->flag_bits.primary		= 1;
	section_desc->severity			= sev;
	section_desc->sec_type			= sec_type;

	snprintf(section_desc->fru_text, 20, "OAM%d",
		 (adev->smuio.funcs && adev->smuio.funcs->get_socket_id) ?
			 adev->smuio.funcs->get_socket_id(adev) :
			 0);

	if (bp_threshold)
		section_desc->flag_bits.exceed_err_threshold = 1;
	if (poison)
		section_desc->flag_bits.latent_err = 1;

	return 0;
}

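/*
 * A fatal section wraps a full crashdump register context: the raw
 * STATUS/ADDR/IPID/SYND register pairs captured from the ACA bank.
 */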
int amdgpu_cper_entry_fill_fatal_section(struct amdgpu_device *adev,
					 struct cper_hdr *hdr,
					 uint32_t idx,
					 struct cper_sec_crashdump_reg_data reg_data)
{
	struct cper_sec_desc *section_desc;
	struct cper_sec_crashdump_fatal *section;

	section_desc = (struct cper_sec_desc *)((uint8_t *)hdr + SEC_DESC_OFFSET(idx));
	section = (struct cper_sec_crashdump_fatal *)((uint8_t *)hdr +
		   FATAL_SEC_OFFSET(hdr->sec_cnt, idx));

	amdgpu_cper_entry_fill_section_desc(adev, section_desc, false, false,
					    CPER_SEV_FATAL, CRASHDUMP, FATAL_SEC_LEN,
					    FATAL_SEC_OFFSET(hdr->sec_cnt, idx));

	section->body.reg_ctx_type = CPER_CTX_TYPE_CRASH;
	section->body.reg_arr_size = sizeof(reg_data);
	section->body.data = reg_data;

	__inc_entry_length(hdr, SEC_DESC_LEN + FATAL_SEC_LEN);

	return 0;
}

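/*
 * Runtime (CE/DE) sections use the AMD non-standard error format: an
 * error-info header plus a register context holding the raw ACA bank
 * dump, truncated to CPER_ACA_REG_COUNT dwords.
 */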
int amdgpu_cper_entry_fill_runtime_section(struct amdgpu_device *adev,
					   struct cper_hdr *hdr,
					   uint32_t idx,
					   enum cper_error_severity sev,
					   uint32_t *reg_dump,
					   uint32_t reg_count)
{
	struct cper_sec_desc *section_desc;
	struct cper_sec_nonstd_err *section;
	bool poison;

	poison = sev != CPER_SEV_NON_FATAL_CORRECTED;
	section_desc = (struct cper_sec_desc *)((uint8_t *)hdr + SEC_DESC_OFFSET(idx));
	section = (struct cper_sec_nonstd_err *)((uint8_t *)hdr +
		   NONSTD_SEC_OFFSET(hdr->sec_cnt, idx));

	amdgpu_cper_entry_fill_section_desc(adev, section_desc, false, poison,
					    sev, RUNTIME, NONSTD_SEC_LEN,
					    NONSTD_SEC_OFFSET(hdr->sec_cnt, idx));

	reg_count = umin(reg_count, CPER_ACA_REG_COUNT);

	section->hdr.valid_bits.err_info_cnt = 1;
	section->hdr.valid_bits.err_context_cnt = 1;

	section->info.error_type = RUNTIME;
	section->info.ms_chk_bits.err_type_valid = 1;
	section->ctx.reg_ctx_type = CPER_CTX_TYPE_CRASH;
	section->ctx.reg_arr_size = sizeof(section->ctx.reg_dump);

	memcpy(section->ctx.reg_dump, reg_dump, reg_count * sizeof(uint32_t));

	__inc_entry_length(hdr, SEC_DESC_LEN + NONSTD_SEC_LEN);

	return 0;
}

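/*
 * Bad-page threshold records reuse the non-standard runtime section
 * format, but with a synthesized register dump rather than a live ACA
 * bank (see the hardcoded values below).
 */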
int amdgpu_cper_entry_fill_bad_page_threshold_section(struct amdgpu_device *adev,
						      struct cper_hdr *hdr,
						      uint32_t idx)
{
	struct cper_sec_desc *section_desc;
	struct cper_sec_nonstd_err *section;
	uint32_t socket_id;

	section_desc = (struct cper_sec_desc *)((uint8_t *)hdr + SEC_DESC_OFFSET(idx));
	section = (struct cper_sec_nonstd_err *)((uint8_t *)hdr +
		   NONSTD_SEC_OFFSET(hdr->sec_cnt, idx));

	amdgpu_cper_entry_fill_section_desc(adev, section_desc, true, false,
					    CPER_SEV_FATAL, RUNTIME, NONSTD_SEC_LEN,
					    NONSTD_SEC_OFFSET(hdr->sec_cnt, idx));

	section->hdr.valid_bits.err_info_cnt = 1;
	section->hdr.valid_bits.err_context_cnt = 1;

	section->info.error_type = RUNTIME;
	section->info.valid_bits.ms_chk = 1;
	section->info.ms_chk_bits.err_type_valid = 1;
	section->info.ms_chk_bits.err_type = 1;
	section->info.ms_chk_bits.pcc = 1;
	section->ctx.reg_ctx_type = CPER_CTX_TYPE_CRASH;
	section->ctx.reg_arr_size = sizeof(section->ctx.reg_dump);

	/* Hardcoded reg dump for bad page threshold CPER */
	socket_id = (adev->smuio.funcs && adev->smuio.funcs->get_socket_id) ?
				adev->smuio.funcs->get_socket_id(adev) :
				0;
	section->ctx.reg_dump[CPER_ACA_REG_CTL_LO]    = 0x1;
	section->ctx.reg_dump[CPER_ACA_REG_CTL_HI]    = 0x0;
	section->ctx.reg_dump[CPER_ACA_REG_STATUS_LO] = 0x137;
	section->ctx.reg_dump[CPER_ACA_REG_STATUS_HI] = 0xB0000000;
	section->ctx.reg_dump[CPER_ACA_REG_ADDR_LO]   = 0x0;
	section->ctx.reg_dump[CPER_ACA_REG_ADDR_HI]   = 0x0;
	section->ctx.reg_dump[CPER_ACA_REG_MISC0_LO]  = 0x0;
	section->ctx.reg_dump[CPER_ACA_REG_MISC0_HI]  = 0x0;
	section->ctx.reg_dump[CPER_ACA_REG_CONFIG_LO] = 0x2;
	section->ctx.reg_dump[CPER_ACA_REG_CONFIG_HI] = 0x1ff;
	/*
	 * Fold the socket ID into the IPID pair: bit 0 of IPID_LO carries
	 * socket_id / 4, bits [13:12] of IPID_HI carry socket_id % 4.
	 */
	section->ctx.reg_dump[CPER_ACA_REG_IPID_LO]   = (socket_id / 4) & 0x01;
	section->ctx.reg_dump[CPER_ACA_REG_IPID_HI]   = 0x096 | (((socket_id % 4) & 0x3) << 12);
	section->ctx.reg_dump[CPER_ACA_REG_SYND_LO]   = 0x0;
	section->ctx.reg_dump[CPER_ACA_REG_SYND_HI]   = 0x0;

	__inc_entry_length(hdr, SEC_DESC_LEN + NONSTD_SEC_LEN);

	return 0;
}

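/*
 * Producers follow the pattern used by the generate_* helpers below:
 *
 *	hdr = amdgpu_cper_alloc_entry(adev, type, sec_cnt);
 *	amdgpu_cper_entry_fill_hdr(adev, hdr, type, sev);
 *	... fill one section per sec_cnt ...
 *	amdgpu_cper_ring_write(ring, hdr, hdr->record_length);
 *	kfree(hdr);
 */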
struct cper_hdr *amdgpu_cper_alloc_entry(struct amdgpu_device *adev,
					 enum amdgpu_cper_type type,
					 uint16_t section_count)
{
	struct cper_hdr *hdr;
	uint32_t size = 0;

	size += HDR_LEN;
	size += (SEC_DESC_LEN * section_count);

	switch (type) {
	case AMDGPU_CPER_TYPE_RUNTIME:
	case AMDGPU_CPER_TYPE_BP_THRESHOLD:
		size += (NONSTD_SEC_LEN * section_count);
		break;
	case AMDGPU_CPER_TYPE_FATAL:
		size += (FATAL_SEC_LEN * section_count);
		break;
	case AMDGPU_CPER_TYPE_BOOT:
		size += (BOOT_SEC_LEN * section_count);
		break;
	default:
		dev_err(adev->dev, "Unknown CPER Type!\n");
		return NULL;
	}

	hdr = kzalloc(size, GFP_KERNEL);
	if (!hdr)
		return NULL;

	/* Save this early; the section offset macros depend on sec_cnt */
	hdr->sec_cnt = section_count;

	return hdr;
}

int amdgpu_cper_generate_ue_record(struct amdgpu_device *adev,
				   struct aca_bank *bank)
{
	struct cper_hdr *fatal = NULL;
	struct cper_sec_crashdump_reg_data reg_data = { 0 };
	struct amdgpu_ring *ring = &adev->cper.ring_buf;
	int ret;

	fatal = amdgpu_cper_alloc_entry(adev, AMDGPU_CPER_TYPE_FATAL, 1);
	if (!fatal) {
		dev_err(adev->dev, "failed to allocate cper entry for ue record\n");
		return -ENOMEM;
	}

	reg_data.status_lo = lower_32_bits(bank->regs[ACA_REG_IDX_STATUS]);
	reg_data.status_hi = upper_32_bits(bank->regs[ACA_REG_IDX_STATUS]);
	reg_data.addr_lo   = lower_32_bits(bank->regs[ACA_REG_IDX_ADDR]);
	reg_data.addr_hi   = upper_32_bits(bank->regs[ACA_REG_IDX_ADDR]);
	reg_data.ipid_lo   = lower_32_bits(bank->regs[ACA_REG_IDX_IPID]);
	reg_data.ipid_hi   = upper_32_bits(bank->regs[ACA_REG_IDX_IPID]);
	reg_data.synd_lo   = lower_32_bits(bank->regs[ACA_REG_IDX_SYND]);
	reg_data.synd_hi   = upper_32_bits(bank->regs[ACA_REG_IDX_SYND]);

	amdgpu_cper_entry_fill_hdr(adev, fatal, AMDGPU_CPER_TYPE_FATAL, CPER_SEV_FATAL);
	ret = amdgpu_cper_entry_fill_fatal_section(adev, fatal, 0, reg_data);
	if (ret) {
		kfree(fatal);
		return ret;
	}

	amdgpu_cper_ring_write(ring, fatal, fatal->record_length);
	kfree(fatal);

	return 0;
}

int amdgpu_cper_generate_bp_threshold_record(struct amdgpu_device *adev)
{
	struct cper_hdr *bp_threshold = NULL;
	struct amdgpu_ring *ring = &adev->cper.ring_buf;
	int ret;

	bp_threshold = amdgpu_cper_alloc_entry(adev, AMDGPU_CPER_TYPE_BP_THRESHOLD, 1);
	if (!bp_threshold) {
		dev_err(adev->dev, "failed to allocate cper entry for bad page threshold record\n");
		return -ENOMEM;
	}

	amdgpu_cper_entry_fill_hdr(adev, bp_threshold,
				   AMDGPU_CPER_TYPE_BP_THRESHOLD,
				   CPER_SEV_FATAL);
	ret = amdgpu_cper_entry_fill_bad_page_threshold_section(adev, bp_threshold, 0);
	if (ret) {
		kfree(bp_threshold);
		return ret;
	}

	amdgpu_cper_ring_write(ring, bp_threshold, bp_threshold->record_length);
	kfree(bp_threshold);

	return 0;
}

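/*
 * Map an ACA error type onto a CPER severity; deferred (poison) errors
 * are reported as non-fatal uncorrected.
 */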
static enum cper_error_severity amdgpu_aca_err_type_to_cper_sev(struct amdgpu_device *adev,
								enum aca_error_type aca_err_type)
{
	switch (aca_err_type) {
	case ACA_ERROR_TYPE_UE:
		return CPER_SEV_FATAL;
	case ACA_ERROR_TYPE_CE:
		return CPER_SEV_NON_FATAL_CORRECTED;
	case ACA_ERROR_TYPE_DEFERRED:
		return CPER_SEV_NON_FATAL_UNCORRECTED;
	default:
		dev_err(adev->dev, "Unknown ACA error type!\n");
		return CPER_SEV_FATAL;
	}
}

int amdgpu_cper_generate_ce_records(struct amdgpu_device *adev,
				    struct aca_banks *banks,
				    uint16_t bank_count)
{
	struct cper_hdr *corrected = NULL;
	enum cper_error_severity sev = CPER_SEV_NON_FATAL_CORRECTED;
	struct amdgpu_ring *ring = &adev->cper.ring_buf;
	uint32_t reg_data[CPER_ACA_REG_COUNT] = { 0 };
	struct aca_bank_node *node;
	struct aca_bank *bank;
	uint32_t i = 0;
	int ret;

	corrected = amdgpu_cper_alloc_entry(adev, AMDGPU_CPER_TYPE_RUNTIME, bank_count);
	if (!corrected) {
		dev_err(adev->dev, "failed to allocate cper entry for ce records\n");
		return -ENOMEM;
	}

	/* Raise severity if any DE is detected in the ACA bank list */
	list_for_each_entry(node, &banks->list, node) {
		bank = &node->bank;
		if (bank->aca_err_type == ACA_ERROR_TYPE_DEFERRED) {
			sev = CPER_SEV_NON_FATAL_UNCORRECTED;
			break;
		}
	}

	amdgpu_cper_entry_fill_hdr(adev, corrected, AMDGPU_CPER_TYPE_RUNTIME, sev);

	/* Combine CE and DE banks into one cper record */
	list_for_each_entry(node, &banks->list, node) {
		bank = &node->bank;
		reg_data[CPER_ACA_REG_CTL_LO]    = lower_32_bits(bank->regs[ACA_REG_IDX_CTL]);
		reg_data[CPER_ACA_REG_CTL_HI]    = upper_32_bits(bank->regs[ACA_REG_IDX_CTL]);
		reg_data[CPER_ACA_REG_STATUS_LO] = lower_32_bits(bank->regs[ACA_REG_IDX_STATUS]);
		reg_data[CPER_ACA_REG_STATUS_HI] = upper_32_bits(bank->regs[ACA_REG_IDX_STATUS]);
		reg_data[CPER_ACA_REG_ADDR_LO]   = lower_32_bits(bank->regs[ACA_REG_IDX_ADDR]);
		reg_data[CPER_ACA_REG_ADDR_HI]   = upper_32_bits(bank->regs[ACA_REG_IDX_ADDR]);
		reg_data[CPER_ACA_REG_MISC0_LO]  = lower_32_bits(bank->regs[ACA_REG_IDX_MISC0]);
		reg_data[CPER_ACA_REG_MISC0_HI]  = upper_32_bits(bank->regs[ACA_REG_IDX_MISC0]);
		reg_data[CPER_ACA_REG_CONFIG_LO] = lower_32_bits(bank->regs[ACA_REG_IDX_CONFIG]);
		reg_data[CPER_ACA_REG_CONFIG_HI] = upper_32_bits(bank->regs[ACA_REG_IDX_CONFIG]);
		reg_data[CPER_ACA_REG_IPID_LO]   = lower_32_bits(bank->regs[ACA_REG_IDX_IPID]);
		reg_data[CPER_ACA_REG_IPID_HI]   = upper_32_bits(bank->regs[ACA_REG_IDX_IPID]);
		reg_data[CPER_ACA_REG_SYND_LO]   = lower_32_bits(bank->regs[ACA_REG_IDX_SYND]);
		reg_data[CPER_ACA_REG_SYND_HI]   = upper_32_bits(bank->regs[ACA_REG_IDX_SYND]);

		ret = amdgpu_cper_entry_fill_runtime_section(adev, corrected, i++,
				amdgpu_aca_err_type_to_cper_sev(adev, bank->aca_err_type),
				reg_data, CPER_ACA_REG_COUNT);
		if (ret) {
			kfree(corrected);
			return ret;
		}
	}

	amdgpu_cper_ring_write(ring, corrected, corrected->record_length);
	kfree(corrected);

	return 0;
}

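/*
 * Ring positions below are dword indices, so (pos << 2) converts to a
 * byte offset; a record may wrap past the end of the ring buffer, hence
 * the two-chunk copies.
 */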
static bool amdgpu_cper_is_hdr(struct amdgpu_ring *ring, u64 pos)
{
	char signature[CPER_SIGNATURE_SZ];

	if ((pos << 2) >= ring->ring_size)
		return false;

	if ((pos << 2) + CPER_SIGNATURE_SZ <= ring->ring_size) {
		memcpy(signature, &ring->ring[pos], CPER_SIGNATURE_SZ);
	} else {
		u32 chunk = ring->ring_size - (pos << 2);

		memcpy(signature, &ring->ring[pos], chunk);
		memcpy(signature + chunk, ring->ring, CPER_SIGNATURE_SZ - chunk);
	}

	return !memcmp(signature, "CPER", CPER_SIGNATURE_SZ);
}

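/*
 * Return how many bytes may be consumed at @pos before the next record
 * boundary: the record length if a CPER header sits at @pos, otherwise
 * the distance to the next header found by scanning forward, in either
 * case capped at the distance to the end of the ring.
 */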
static u32 amdgpu_cper_ring_get_ent_sz(struct amdgpu_ring *ring, u64 pos)
{
	struct cper_hdr chdr;
	u64 p;
	u32 chunk, rec_len = 0;

	chunk = ring->ring_size - (pos << 2);

	if (amdgpu_cper_is_hdr(ring, pos)) {
		if (chunk >= sizeof(chdr)) {
			memcpy(&chdr, &ring->ring[pos], sizeof(chdr));
		} else {
			memcpy(&chdr, &ring->ring[pos], chunk);
			memcpy((u8 *)&chdr + chunk, ring->ring, sizeof(chdr) - chunk);
		}

		rec_len = chdr.record_length;
		goto calc;
	}

	/* the ring buffer is not full, so there is no cper data after ring->wptr */
	if (ring->count_dw)
		goto calc;

	for (p = pos + 1; p <= ring->buf_mask; p++) {
		if (amdgpu_cper_is_hdr(ring, p)) {
			rec_len = (p - pos) << 2;
			goto calc;
		}
	}

calc:
	if (!rec_len)
		return chunk;
	else
		return umin(rec_len, chunk);
}

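/*
 * Copy one CPER record into the ring buffer. Each copy is bounded by
 * the existing record boundaries at wptr so that, on overflow, rptr can
 * be advanced over whole stale records rather than into their middle.
 */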
void amdgpu_cper_ring_write(struct amdgpu_ring *ring, void *src, int count)
{
	u64 pos, wptr_old, rptr;
	int rec_cnt_dw = count >> 2;
	u32 chunk, ent_sz;
	u8 *s = (u8 *)src;

	if (count >= ring->ring_size - 4) {
		dev_err(ring->adev->dev,
			"CPER data size(%d) is larger than ring size(%d)\n",
			count, ring->ring_size - 4);

		return;
	}

	mutex_lock(&ring->adev->cper.ring_lock);

	wptr_old = ring->wptr;
	rptr = *ring->rptr_cpu_addr & ring->ptr_mask;

	while (count) {
		ent_sz = amdgpu_cper_ring_get_ent_sz(ring, ring->wptr);
		chunk = umin(ent_sz, count);

		memcpy(&ring->ring[ring->wptr], s, chunk);

		ring->wptr += (chunk >> 2);
		ring->wptr &= ring->ptr_mask;
		count -= chunk;
		s += chunk;
	}

	if (ring->count_dw < rec_cnt_dw)
		ring->count_dw = 0;

	/*
	 * The buffer has overflowed: the three cases cover every circular
	 * ordering in which rptr falls inside the newly written region, so
	 * advance rptr record by record until it points at a header again.
	 */
	if (((wptr_old < rptr) && (rptr <= ring->wptr)) ||
	    ((ring->wptr < wptr_old) && (wptr_old < rptr)) ||
	    ((rptr <= ring->wptr) && (ring->wptr < wptr_old))) {
		pos = (ring->wptr + 1) & ring->ptr_mask;

		do {
			ent_sz = amdgpu_cper_ring_get_ent_sz(ring, pos);

			rptr += (ent_sz >> 2);
			rptr &= ring->ptr_mask;
			*ring->rptr_cpu_addr = rptr;

			pos = rptr;
		} while (!amdgpu_cper_is_hdr(ring, rptr));
	}

	if (ring->count_dw >= rec_cnt_dw)
		ring->count_dw -= rec_cnt_dw;
	mutex_unlock(&ring->adev->cper.ring_lock);
}

static u64 amdgpu_cper_ring_get_rptr(struct amdgpu_ring *ring)
{
	return *(ring->rptr_cpu_addr);
}

static u64 amdgpu_cper_ring_get_wptr(struct amdgpu_ring *ring)
{
	return ring->wptr;
}

static const struct amdgpu_ring_funcs cper_ring_funcs = {
	.type = AMDGPU_RING_TYPE_CPER,
	.align_mask = 0xff,
	.support_64bit_ptrs = false,
	.get_rptr = amdgpu_cper_ring_get_rptr,
	.get_wptr = amdgpu_cper_ring_get_wptr,
};

static int amdgpu_cper_ring_init(struct amdgpu_device *adev)
{
	struct amdgpu_ring *ring = &(adev->cper.ring_buf);

	mutex_init(&adev->cper.ring_lock);

	ring->adev = NULL;
	ring->ring_obj = NULL;
	ring->use_doorbell = false;
	ring->no_scheduler = true;
	ring->funcs = &cper_ring_funcs;

	sprintf(ring->name, "cper");
	return amdgpu_ring_init(adev, ring, CPER_MAX_RING_SIZE, NULL, 0,
				AMDGPU_RING_PRIO_DEFAULT, NULL);
}

int amdgpu_cper_init(struct amdgpu_device *adev)
{
	int r;

	if (amdgpu_sriov_vf(adev) && !amdgpu_sriov_ras_cper_en(adev))
		return 0;
	else if (!amdgpu_sriov_vf(adev) && !amdgpu_uniras_enabled(adev) &&
		!amdgpu_aca_is_enabled(adev))
		return 0;

	r = amdgpu_cper_ring_init(adev);
	if (r) {
		dev_err(adev->dev, "failed to initialize cper ring, r = %d\n", r);
		return r;
	}

	mutex_init(&adev->cper.cper_lock);

	adev->cper.enabled = true;
	adev->cper.max_count = CPER_MAX_ALLOWED_COUNT;

	return 0;
}

int amdgpu_cper_fini(struct amdgpu_device *adev)
{
	if (!amdgpu_aca_is_enabled(adev) && !amdgpu_sriov_ras_cper_en(adev))
		return 0;

	adev->cper.enabled = false;

	amdgpu_ring_fini(&(adev->cper.ring_buf));
	adev->cper.count = 0;
	adev->cper.wptr = 0;

	return 0;
}