// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright 2025 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 */
#include <linux/list.h>
#include "amdgpu.h"

static const guid_t MCE			= CPER_NOTIFY_MCE;
static const guid_t CMC			= CPER_NOTIFY_CMC;
static const guid_t BOOT		= BOOT_TYPE;

static const guid_t CRASHDUMP		= AMD_CRASHDUMP;
static const guid_t RUNTIME		= AMD_GPU_NONSTANDARD_ERROR;

static void __inc_entry_length(struct cper_hdr *hdr, uint32_t size)
{
	hdr->record_length += size;
}

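/*
 * Fill a CPER timestamp with the current wall-clock time (UTC), split into
 * century, two-digit year, month, day, hours, minutes and seconds.
 */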
static void amdgpu_cper_get_timestamp(struct cper_timestamp *timestamp)
{
	struct tm tm;
	time64_t now = ktime_get_real_seconds();

	time64_to_tm(now, 0, &tm);
	timestamp->seconds = tm.tm_sec;
	timestamp->minutes = tm.tm_min;
	timestamp->hours = tm.tm_hour;
	timestamp->flag = 0;
	timestamp->day = tm.tm_mday;
	timestamp->month = 1 + tm.tm_mon;
	timestamp->year = (1900 + tm.tm_year) % 100;
	timestamp->century = (1900 + tm.tm_year) / 100;
}

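/*
 * Fill the common CPER record header: "CPER" signature, revision, severity,
 * timestamp, a per-device record id (socket id plus a running counter), the
 * platform id (PCI vendor/device), the creator id, and a notification type
 * GUID derived from the record type and severity.
 */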
void amdgpu_cper_entry_fill_hdr(struct amdgpu_device *adev,
				struct cper_hdr *hdr,
				enum amdgpu_cper_type type,
				enum cper_error_severity sev)
{
	char record_id[16];

	hdr->signature[0]		= 'C';
	hdr->signature[1]		= 'P';
	hdr->signature[2]		= 'E';
	hdr->signature[3]		= 'R';
	hdr->revision			= CPER_HDR_REV_1;
	hdr->signature_end		= 0xFFFFFFFF;
	hdr->error_severity		= sev;

	hdr->valid_bits.platform_id	= 1;
	hdr->valid_bits.partition_id	= 1;
	hdr->valid_bits.timestamp	= 1;

	amdgpu_cper_get_timestamp(&hdr->timestamp);

	snprintf(record_id, 9, "%d:%X",
		 (adev->smuio.funcs && adev->smuio.funcs->get_socket_id) ?
			 adev->smuio.funcs->get_socket_id(adev) :
			 0,
		 atomic_inc_return(&adev->cper.unique_id));
	memcpy(hdr->record_id, record_id, 8);

	snprintf(hdr->platform_id, 16, "0x%04X:0x%04X",
		 adev->pdev->vendor, adev->pdev->device);
	/* pmfw version should be part of creator_id according to CPER spec */
	snprintf(hdr->creator_id, 16, "%s", CPER_CREATOR_ID_AMDGPU);

	switch (type) {
	case AMDGPU_CPER_TYPE_BOOT:
		hdr->notify_type = BOOT;
		break;
	case AMDGPU_CPER_TYPE_FATAL:
	case AMDGPU_CPER_TYPE_BP_THRESHOLD:
		hdr->notify_type = MCE;
		break;
	case AMDGPU_CPER_TYPE_RUNTIME:
		if (sev == CPER_SEV_NON_FATAL_CORRECTED)
			hdr->notify_type = CMC;
		else
			hdr->notify_type = MCE;
		break;
	default:
		dev_err(adev->dev, "Unknown CPER Type\n");
		break;
	}

	__inc_entry_length(hdr, HDR_LEN);
}

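/*
 * Fill a section descriptor: revision, offset and length of the section
 * payload, severity, section type GUID and an "OAM<socket>" FRU text.  The
 * bad page threshold and poison cases are flagged via exceed_err_threshold
 * and latent_err respectively.
 */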
static int amdgpu_cper_entry_fill_section_desc(struct amdgpu_device *adev,
					       struct cper_sec_desc *section_desc,
					       bool bp_threshold,
					       bool poison,
					       enum cper_error_severity sev,
					       guid_t sec_type,
					       uint32_t section_length,
					       uint32_t section_offset)
{
	section_desc->revision_minor		= CPER_SEC_MINOR_REV_1;
	section_desc->revision_major		= CPER_SEC_MAJOR_REV_22;
	section_desc->sec_offset		= section_offset;
	section_desc->sec_length		= section_length;
	section_desc->valid_bits.fru_text	= 1;
	section_desc->flag_bits.primary		= 1;
	section_desc->severity			= sev;
	section_desc->sec_type			= sec_type;

	snprintf(section_desc->fru_text, 20, "OAM%d",
		 (adev->smuio.funcs && adev->smuio.funcs->get_socket_id) ?
			 adev->smuio.funcs->get_socket_id(adev) :
			 0);

	if (bp_threshold)
		section_desc->flag_bits.exceed_err_threshold = 1;
	if (poison)
		section_desc->flag_bits.latent_err = 1;

	return 0;
}

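/*
 * Append a fatal (crashdump) section at index @idx: write its descriptor and
 * copy the raw ACA register snapshot into the section body.
 */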
int amdgpu_cper_entry_fill_fatal_section(struct amdgpu_device *adev,
					 struct cper_hdr *hdr,
					 uint32_t idx,
					 struct cper_sec_crashdump_reg_data reg_data)
{
	struct cper_sec_desc *section_desc;
	struct cper_sec_crashdump_fatal *section;

	section_desc = (struct cper_sec_desc *)((uint8_t *)hdr + SEC_DESC_OFFSET(idx));
	section = (struct cper_sec_crashdump_fatal *)((uint8_t *)hdr +
		   FATAL_SEC_OFFSET(hdr->sec_cnt, idx));

	amdgpu_cper_entry_fill_section_desc(adev, section_desc, false, false,
					    CPER_SEV_FATAL, CRASHDUMP, FATAL_SEC_LEN,
					    FATAL_SEC_OFFSET(hdr->sec_cnt, idx));

	section->body.reg_ctx_type = CPER_CTX_TYPE_CRASH;
	section->body.reg_arr_size = sizeof(reg_data);
	section->body.data = reg_data;

	__inc_entry_length(hdr, SEC_DESC_LEN + FATAL_SEC_LEN);

	return 0;
}

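/*
 * Append a runtime (non-standard error) section at index @idx with the given
 * severity and up to CPER_ACA_REG_COUNT ACA register dwords.  Any severity
 * other than corrected is treated as poison and flagged as a latent error.
 */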
int amdgpu_cper_entry_fill_runtime_section(struct amdgpu_device *adev,
					   struct cper_hdr *hdr,
					   uint32_t idx,
					   enum cper_error_severity sev,
					   uint32_t *reg_dump,
					   uint32_t reg_count)
{
	struct cper_sec_desc *section_desc;
	struct cper_sec_nonstd_err *section;
	bool poison;

	poison = (sev != CPER_SEV_NON_FATAL_CORRECTED);
	section_desc = (struct cper_sec_desc *)((uint8_t *)hdr + SEC_DESC_OFFSET(idx));
	section = (struct cper_sec_nonstd_err *)((uint8_t *)hdr +
		   NONSTD_SEC_OFFSET(hdr->sec_cnt, idx));

	amdgpu_cper_entry_fill_section_desc(adev, section_desc, false, poison,
					    sev, RUNTIME, NONSTD_SEC_LEN,
					    NONSTD_SEC_OFFSET(hdr->sec_cnt, idx));

	reg_count = umin(reg_count, CPER_ACA_REG_COUNT);

	section->hdr.valid_bits.err_info_cnt = 1;
	section->hdr.valid_bits.err_context_cnt = 1;

	section->info.error_type = RUNTIME;
	section->info.ms_chk_bits.err_type_valid = 1;
	section->ctx.reg_ctx_type = CPER_CTX_TYPE_CRASH;
	section->ctx.reg_arr_size = sizeof(section->ctx.reg_dump);

	memcpy(section->ctx.reg_dump, reg_dump, reg_count * sizeof(uint32_t));

	__inc_entry_length(hdr, SEC_DESC_LEN + NONSTD_SEC_LEN);

	return 0;
}

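/*
 * Append a runtime section that reports the bad page threshold being
 * exceeded.  The register dump is hardcoded rather than read from an ACA
 * bank, and the descriptor carries the exceed_err_threshold flag.
 */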
int amdgpu_cper_entry_fill_bad_page_threshold_section(struct amdgpu_device *adev,
						      struct cper_hdr *hdr,
						      uint32_t idx)
{
	struct cper_sec_desc *section_desc;
	struct cper_sec_nonstd_err *section;

	section_desc = (struct cper_sec_desc *)((uint8_t *)hdr + SEC_DESC_OFFSET(idx));
	section = (struct cper_sec_nonstd_err *)((uint8_t *)hdr +
		   NONSTD_SEC_OFFSET(hdr->sec_cnt, idx));

	amdgpu_cper_entry_fill_section_desc(adev, section_desc, true, false,
					    CPER_SEV_FATAL, RUNTIME, NONSTD_SEC_LEN,
					    NONSTD_SEC_OFFSET(hdr->sec_cnt, idx));

	section->hdr.valid_bits.err_info_cnt = 1;
	section->hdr.valid_bits.err_context_cnt = 1;

	section->info.error_type = RUNTIME;
	section->info.ms_chk_bits.err_type_valid = 1;
	section->ctx.reg_ctx_type = CPER_CTX_TYPE_CRASH;
	section->ctx.reg_arr_size = sizeof(section->ctx.reg_dump);

	/* Hardcoded Reg dump for bad page threshold CPER */
	section->ctx.reg_dump[CPER_ACA_REG_CTL_LO]    = 0x1;
	section->ctx.reg_dump[CPER_ACA_REG_CTL_HI]    = 0x0;
	section->ctx.reg_dump[CPER_ACA_REG_STATUS_LO] = 0x137;
	section->ctx.reg_dump[CPER_ACA_REG_STATUS_HI] = 0xB0000000;
	section->ctx.reg_dump[CPER_ACA_REG_ADDR_LO]   = 0x0;
	section->ctx.reg_dump[CPER_ACA_REG_ADDR_HI]   = 0x0;
	section->ctx.reg_dump[CPER_ACA_REG_MISC0_LO]  = 0x0;
	section->ctx.reg_dump[CPER_ACA_REG_MISC0_HI]  = 0x0;
	section->ctx.reg_dump[CPER_ACA_REG_CONFIG_LO] = 0x2;
	section->ctx.reg_dump[CPER_ACA_REG_CONFIG_HI] = 0x1ff;
	section->ctx.reg_dump[CPER_ACA_REG_IPID_LO]   = 0x0;
	section->ctx.reg_dump[CPER_ACA_REG_IPID_HI]   = 0x96;
	section->ctx.reg_dump[CPER_ACA_REG_SYND_LO]   = 0x0;
	section->ctx.reg_dump[CPER_ACA_REG_SYND_HI]   = 0x0;

	__inc_entry_length(hdr, SEC_DESC_LEN + NONSTD_SEC_LEN);

	return 0;
}

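/*
 * Allocate a zeroed CPER record large enough for the header plus
 * @section_count section descriptors and section bodies of the given type.
 * The caller is responsible for freeing the returned record.
 */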
struct cper_hdr *amdgpu_cper_alloc_entry(struct amdgpu_device *adev,
					 enum amdgpu_cper_type type,
					 uint16_t section_count)
{
	struct cper_hdr *hdr;
	uint32_t size = 0;

	size += HDR_LEN;
	size += (SEC_DESC_LEN * section_count);

	switch (type) {
	case AMDGPU_CPER_TYPE_RUNTIME:
	case AMDGPU_CPER_TYPE_BP_THRESHOLD:
		size += (NONSTD_SEC_LEN * section_count);
		break;
	case AMDGPU_CPER_TYPE_FATAL:
		size += (FATAL_SEC_LEN * section_count);
		break;
	case AMDGPU_CPER_TYPE_BOOT:
		size += (BOOT_SEC_LEN * section_count);
		break;
	default:
		dev_err(adev->dev, "Unknown CPER Type!\n");
		return NULL;
	}

	hdr = kzalloc(size, GFP_KERNEL);
	if (!hdr)
		return NULL;

	/* Save the section count early; the fill helpers compute offsets from it */
	hdr->sec_cnt = section_count;

	return hdr;
}

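/*
 * Build a single-section fatal CPER record from an ACA bank reporting an
 * uncorrectable error and push it onto the CPER ring buffer.
 */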
int amdgpu_cper_generate_ue_record(struct amdgpu_device *adev,
				   struct aca_bank *bank)
{
	struct cper_hdr *fatal = NULL;
	struct cper_sec_crashdump_reg_data reg_data = { 0 };
	struct amdgpu_ring *ring = &adev->cper.ring_buf;
	int ret;

	fatal = amdgpu_cper_alloc_entry(adev, AMDGPU_CPER_TYPE_FATAL, 1);
	if (!fatal) {
		dev_err(adev->dev, "failed to allocate cper entry for ue record\n");
		return -ENOMEM;
	}

	reg_data.status_lo = lower_32_bits(bank->regs[ACA_REG_IDX_STATUS]);
	reg_data.status_hi = upper_32_bits(bank->regs[ACA_REG_IDX_STATUS]);
	reg_data.addr_lo   = lower_32_bits(bank->regs[ACA_REG_IDX_ADDR]);
	reg_data.addr_hi   = upper_32_bits(bank->regs[ACA_REG_IDX_ADDR]);
	reg_data.ipid_lo   = lower_32_bits(bank->regs[ACA_REG_IDX_IPID]);
	reg_data.ipid_hi   = upper_32_bits(bank->regs[ACA_REG_IDX_IPID]);
	reg_data.synd_lo   = lower_32_bits(bank->regs[ACA_REG_IDX_SYND]);
	reg_data.synd_hi   = upper_32_bits(bank->regs[ACA_REG_IDX_SYND]);

	amdgpu_cper_entry_fill_hdr(adev, fatal, AMDGPU_CPER_TYPE_FATAL, CPER_SEV_FATAL);
	ret = amdgpu_cper_entry_fill_fatal_section(adev, fatal, 0, reg_data);
	if (ret) {
		kfree(fatal);
		return ret;
	}

	amdgpu_cper_ring_write(ring, fatal, fatal->record_length);
	kfree(fatal);

	return 0;
}

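/*
 * Build a fatal CPER record reporting that the bad page threshold has been
 * exceeded and push it onto the CPER ring buffer.
 */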
int amdgpu_cper_generate_bp_threshold_record(struct amdgpu_device *adev)
{
	struct cper_hdr *bp_threshold = NULL;
	struct amdgpu_ring *ring = &adev->cper.ring_buf;
	int ret;

	bp_threshold = amdgpu_cper_alloc_entry(adev, AMDGPU_CPER_TYPE_BP_THRESHOLD, 1);
	if (!bp_threshold) {
		dev_err(adev->dev, "failed to allocate cper entry for bad page threshold record\n");
		return -ENOMEM;
	}

	amdgpu_cper_entry_fill_hdr(adev, bp_threshold,
				   AMDGPU_CPER_TYPE_BP_THRESHOLD,
				   CPER_SEV_FATAL);
	ret = amdgpu_cper_entry_fill_bad_page_threshold_section(adev, bp_threshold, 0);
	if (ret) {
		kfree(bp_threshold);
		return ret;
	}

	amdgpu_cper_ring_write(ring, bp_threshold, bp_threshold->record_length);
	kfree(bp_threshold);

	return 0;
}

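/* Map an ACA error type to the corresponding CPER severity. */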
static enum cper_error_severity amdgpu_aca_err_type_to_cper_sev(struct amdgpu_device *adev,
								enum aca_error_type aca_err_type)
{
	switch (aca_err_type) {
	case ACA_ERROR_TYPE_UE:
		return CPER_SEV_FATAL;
	case ACA_ERROR_TYPE_CE:
		return CPER_SEV_NON_FATAL_CORRECTED;
	case ACA_ERROR_TYPE_DEFERRED:
		return CPER_SEV_NON_FATAL_UNCORRECTED;
	default:
		dev_err(adev->dev, "Unknown ACA error type!\n");
		return CPER_SEV_FATAL;
	}
}

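/*
 * Build one runtime CPER record with a section per ACA bank (corrected and
 * deferred errors combined) and push it onto the CPER ring buffer.  The
 * record severity is raised to uncorrected if any bank holds a deferred
 * error.
 */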
int amdgpu_cper_generate_ce_records(struct amdgpu_device *adev,
				    struct aca_banks *banks,
				    uint16_t bank_count)
{
	struct cper_hdr *corrected = NULL;
	enum cper_error_severity sev = CPER_SEV_NON_FATAL_CORRECTED;
	struct amdgpu_ring *ring = &adev->cper.ring_buf;
	uint32_t reg_data[CPER_ACA_REG_COUNT] = { 0 };
	struct aca_bank_node *node;
	struct aca_bank *bank;
	uint32_t i = 0;
	int ret;

	corrected = amdgpu_cper_alloc_entry(adev, AMDGPU_CPER_TYPE_RUNTIME, bank_count);
	if (!corrected) {
		dev_err(adev->dev, "failed to allocate cper entry for ce records\n");
		return -ENOMEM;
	}

	/* Raise severity if any DE is detected in the ACA bank list */
	list_for_each_entry(node, &banks->list, node) {
		bank = &node->bank;
		if (bank->aca_err_type == ACA_ERROR_TYPE_DEFERRED) {
			sev = CPER_SEV_NON_FATAL_UNCORRECTED;
			break;
		}
	}

	amdgpu_cper_entry_fill_hdr(adev, corrected, AMDGPU_CPER_TYPE_RUNTIME, sev);

	/* Combine CE and DE banks into one cper record */
	list_for_each_entry(node, &banks->list, node) {
		bank = &node->bank;
		reg_data[CPER_ACA_REG_CTL_LO]    = lower_32_bits(bank->regs[ACA_REG_IDX_CTL]);
		reg_data[CPER_ACA_REG_CTL_HI]    = upper_32_bits(bank->regs[ACA_REG_IDX_CTL]);
		reg_data[CPER_ACA_REG_STATUS_LO] = lower_32_bits(bank->regs[ACA_REG_IDX_STATUS]);
		reg_data[CPER_ACA_REG_STATUS_HI] = upper_32_bits(bank->regs[ACA_REG_IDX_STATUS]);
		reg_data[CPER_ACA_REG_ADDR_LO]   = lower_32_bits(bank->regs[ACA_REG_IDX_ADDR]);
		reg_data[CPER_ACA_REG_ADDR_HI]   = upper_32_bits(bank->regs[ACA_REG_IDX_ADDR]);
		reg_data[CPER_ACA_REG_MISC0_LO]  = lower_32_bits(bank->regs[ACA_REG_IDX_MISC0]);
		reg_data[CPER_ACA_REG_MISC0_HI]  = upper_32_bits(bank->regs[ACA_REG_IDX_MISC0]);
		reg_data[CPER_ACA_REG_CONFIG_LO] = lower_32_bits(bank->regs[ACA_REG_IDX_CONFIG]);
		reg_data[CPER_ACA_REG_CONFIG_HI] = upper_32_bits(bank->regs[ACA_REG_IDX_CONFIG]);
		reg_data[CPER_ACA_REG_IPID_LO]   = lower_32_bits(bank->regs[ACA_REG_IDX_IPID]);
		reg_data[CPER_ACA_REG_IPID_HI]   = upper_32_bits(bank->regs[ACA_REG_IDX_IPID]);
		reg_data[CPER_ACA_REG_SYND_LO]   = lower_32_bits(bank->regs[ACA_REG_IDX_SYND]);
		reg_data[CPER_ACA_REG_SYND_HI]   = upper_32_bits(bank->regs[ACA_REG_IDX_SYND]);

		ret = amdgpu_cper_entry_fill_runtime_section(adev, corrected, i++,
				amdgpu_aca_err_type_to_cper_sev(adev, bank->aca_err_type),
				reg_data, CPER_ACA_REG_COUNT);
		if (ret) {
			kfree(corrected);
			return ret;
		}
	}

	amdgpu_cper_ring_write(ring, corrected, corrected->record_length);
	kfree(corrected);

	return 0;
}

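/* Check whether the ring buffer dword at @pos starts a CPER record. */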
static bool amdgpu_cper_is_hdr(struct amdgpu_ring *ring, u64 pos)
{
	struct cper_hdr *chdr;

	chdr = (struct cper_hdr *)&(ring->ring[pos]);
	return !strcmp(chdr->signature, "CPER");
}

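/*
 * Return the number of bytes that can be consumed at @pos without crossing a
 * record boundary: the record length if @pos is a CPER header, otherwise the
 * distance to the next header, both clamped to the space left before the end
 * of the ring.
 */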
static u32 amdgpu_cper_ring_get_ent_sz(struct amdgpu_ring *ring, u64 pos)
{
	struct cper_hdr *chdr;
	u64 p;
	u32 chunk, rec_len = 0;

	chdr = (struct cper_hdr *)&(ring->ring[pos]);
	chunk = ring->ring_size - (pos << 2);

	if (!strcmp(chdr->signature, "CPER")) {
		rec_len = chdr->record_length;
		goto calc;
	}

	/* ring buffer is not full, no cper data after ring->wptr */
	if (ring->count_dw)
		goto calc;

	for (p = pos + 1; p <= ring->buf_mask; p++) {
		chdr = (struct cper_hdr *)&(ring->ring[p]);
		if (!strcmp(chdr->signature, "CPER")) {
			rec_len = (p - pos) << 2;
			goto calc;
		}
	}

calc:
	if (!rec_len)
		return chunk;
	else
		return umin(rec_len, chunk);
}

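/*
 * Copy a CPER record into the ring buffer, wrapping at the end of the ring.
 * If the write overruns unread data, rptr is advanced record by record until
 * it points at the next intact CPER header, so readers always resume on a
 * record boundary.
 */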
void amdgpu_cper_ring_write(struct amdgpu_ring *ring, void *src, int count)
{
	u64 pos, wptr_old, rptr;
	int rec_cnt_dw = count >> 2;
	u32 chunk, ent_sz;
	u8 *s = (u8 *)src;

	if (count >= ring->ring_size - 4) {
		dev_err(ring->adev->dev,
			"CPER data size (%d) is larger than ring size (%d)\n",
			count, ring->ring_size - 4);

		return;
	}

	mutex_lock(&ring->adev->cper.ring_lock);

	wptr_old = ring->wptr;
	rptr = *ring->rptr_cpu_addr & ring->ptr_mask;

	while (count) {
		ent_sz = amdgpu_cper_ring_get_ent_sz(ring, ring->wptr);
		chunk = umin(ent_sz, count);

		memcpy(&ring->ring[ring->wptr], s, chunk);

		ring->wptr += (chunk >> 2);
		ring->wptr &= ring->ptr_mask;
		count -= chunk;
		s += chunk;
	}

	if (ring->count_dw < rec_cnt_dw)
		ring->count_dw = 0;

	/* the write has overrun rptr, advance it to the next intact record */
	if (((wptr_old < rptr) && (rptr <= ring->wptr)) ||
	    ((ring->wptr < wptr_old) && (wptr_old < rptr)) ||
	    ((rptr <= ring->wptr) && (ring->wptr < wptr_old))) {
		pos = (ring->wptr + 1) & ring->ptr_mask;

		do {
			ent_sz = amdgpu_cper_ring_get_ent_sz(ring, pos);

			rptr += (ent_sz >> 2);
			rptr &= ring->ptr_mask;
			*ring->rptr_cpu_addr = rptr;

			pos = rptr;
		} while (!amdgpu_cper_is_hdr(ring, rptr));
	}

	if (ring->count_dw >= rec_cnt_dw)
		ring->count_dw -= rec_cnt_dw;
	mutex_unlock(&ring->adev->cper.ring_lock);
}

static u64 amdgpu_cper_ring_get_rptr(struct amdgpu_ring *ring)
{
	return *(ring->rptr_cpu_addr);
}

static u64 amdgpu_cper_ring_get_wptr(struct amdgpu_ring *ring)
{
	return ring->wptr;
}

static const struct amdgpu_ring_funcs cper_ring_funcs = {
	.type = AMDGPU_RING_TYPE_CPER,
	.align_mask = 0xff,
	.support_64bit_ptrs = false,
	.get_rptr = amdgpu_cper_ring_get_rptr,
	.get_wptr = amdgpu_cper_ring_get_wptr,
};

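/*
 * The CPER ring is a software-only amdgpu ring: no doorbell and no
 * scheduler, just a CPU-visible buffer tracked via rptr/wptr.
 */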
static int amdgpu_cper_ring_init(struct amdgpu_device *adev)
{
	struct amdgpu_ring *ring = &(adev->cper.ring_buf);

	mutex_init(&adev->cper.ring_lock);

	ring->adev = NULL;
	ring->ring_obj = NULL;
	ring->use_doorbell = false;
	ring->no_scheduler = true;
	ring->funcs = &cper_ring_funcs;

	sprintf(ring->name, "cper");
	return amdgpu_ring_init(adev, ring, CPER_MAX_RING_SIZE, NULL, 0,
				AMDGPU_RING_PRIO_DEFAULT, NULL);
}

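/*
 * Set up CPER support: initialize the CPER ring and bookkeeping.  This is a
 * no-op unless ACA or SR-IOV RAS CPER is enabled for the device.
 */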
int amdgpu_cper_init(struct amdgpu_device *adev)
{
	int r;

	if (!amdgpu_aca_is_enabled(adev) && !amdgpu_sriov_ras_cper_en(adev))
		return 0;

	r = amdgpu_cper_ring_init(adev);
	if (r) {
		dev_err(adev->dev, "failed to initialize cper ring, r = %d\n", r);
		return r;
	}

	mutex_init(&adev->cper.cper_lock);

	adev->cper.enabled = true;
	adev->cper.max_count = CPER_MAX_ALLOWED_COUNT;

	return 0;
}

int amdgpu_cper_fini(struct amdgpu_device *adev)
{
	if (!amdgpu_aca_is_enabled(adev) && !amdgpu_sriov_ras_cper_en(adev))
		return 0;

	adev->cper.enabled = false;

	amdgpu_ring_fini(&(adev->cper.ring_buf));
	adev->cper.count = 0;
	adev->cper.wptr = 0;

	return 0;
}