// SPDX-License-Identifier: MIT
/*
 * Copyright 2025 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 */
#include <linux/list.h>
#include "amdgpu.h"
#include "amdgpu_ras_mgr.h"

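/*
 * Notify-type GUIDs come from the UEFI CPER spec (MCE for uncorrected or
 * fatal reports, CMC for corrected ones); the boot and crashdump/runtime
 * section-type GUIDs are AMD-defined.
 */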
static const guid_t MCE = CPER_NOTIFY_MCE;
static const guid_t CMC = CPER_NOTIFY_CMC;
static const guid_t BOOT = BOOT_TYPE;

static const guid_t CRASHDUMP = AMD_CRASHDUMP;
static const guid_t RUNTIME = AMD_GPU_NONSTANDARD_ERROR;

#define CPER_SIGNATURE_SZ (sizeof(((struct cper_hdr *)0)->signature))

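/* Grow the record length as the header, descriptors, and sections land. */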
static void __inc_entry_length(struct cper_hdr *hdr, uint32_t size)
{
	hdr->record_length += size;
}

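/*
 * Convert wall-clock time into the CPER timestamp layout, which keeps the
 * century and the two-digit year in separate fields.
 */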
static void amdgpu_cper_get_timestamp(struct cper_timestamp *timestamp)
{
	struct tm tm;
	time64_t now = ktime_get_real_seconds();

	time64_to_tm(now, 0, &tm);
	timestamp->seconds = tm.tm_sec;
	timestamp->minutes = tm.tm_min;
	timestamp->hours = tm.tm_hour;
	timestamp->flag = 0;
	timestamp->day = tm.tm_mday;
	timestamp->month = 1 + tm.tm_mon;
	timestamp->year = (1900 + tm.tm_year) % 100;
	timestamp->century = (1900 + tm.tm_year) / 100;
}

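/*
 * Populate the common record header: "CPER" signature, severity, timestamp,
 * a "socket:counter" record id, the PCI vendor/device pair as platform id,
 * and a notify type derived from the record type and severity.
 */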
void amdgpu_cper_entry_fill_hdr(struct amdgpu_device *adev,
				struct cper_hdr *hdr,
				enum amdgpu_cper_type type,
				enum cper_error_severity sev)
{
	char record_id[16];

	hdr->signature[0] = 'C';
	hdr->signature[1] = 'P';
	hdr->signature[2] = 'E';
	hdr->signature[3] = 'R';
	hdr->revision = CPER_HDR_REV_1;
	hdr->signature_end = 0xFFFFFFFF;
	hdr->error_severity = sev;

	hdr->valid_bits.platform_id = 1;
	hdr->valid_bits.timestamp = 1;

	amdgpu_cper_get_timestamp(&hdr->timestamp);

	snprintf(record_id, 9, "%d:%X",
		 (adev->smuio.funcs && adev->smuio.funcs->get_socket_id) ?
			 adev->smuio.funcs->get_socket_id(adev) :
			 0,
		 atomic_inc_return(&adev->cper.unique_id));
	memcpy(hdr->record_id, record_id, 8);

	snprintf(hdr->platform_id, 16, "0x%04X:0x%04X",
		 adev->pdev->vendor, adev->pdev->device);
	/* pmfw version should be part of creator_id according to CPER spec */
	snprintf(hdr->creator_id, 16, "%s", CPER_CREATOR_ID_AMDGPU);

	switch (type) {
	case AMDGPU_CPER_TYPE_BOOT:
		hdr->notify_type = BOOT;
		break;
	case AMDGPU_CPER_TYPE_FATAL:
	case AMDGPU_CPER_TYPE_BP_THRESHOLD:
		hdr->notify_type = MCE;
		break;
	case AMDGPU_CPER_TYPE_RUNTIME:
		if (sev == CPER_SEV_NON_FATAL_CORRECTED)
			hdr->notify_type = CMC;
		else
			hdr->notify_type = MCE;
		break;
	default:
		dev_err(adev->dev, "Unknown CPER Type\n");
		break;
	}

	__inc_entry_length(hdr, HDR_LEN);
}

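/*
 * Describe one section of a record: revision, offset and length within the
 * record, severity, section-type GUID, and an "OAM<socket>" FRU string.
 */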
static int amdgpu_cper_entry_fill_section_desc(struct amdgpu_device *adev,
					       struct cper_sec_desc *section_desc,
					       bool bp_threshold,
					       bool poison,
					       enum cper_error_severity sev,
					       guid_t sec_type,
					       uint32_t section_length,
					       uint32_t section_offset)
{
	section_desc->revision_minor = CPER_SEC_MINOR_REV_1;
	section_desc->revision_major = CPER_SEC_MAJOR_REV_22;
	section_desc->sec_offset = section_offset;
	section_desc->sec_length = section_length;
	section_desc->valid_bits.fru_text = 1;
	section_desc->flag_bits.primary = 1;
	section_desc->severity = sev;
	section_desc->sec_type = sec_type;

	snprintf(section_desc->fru_text, 20, "OAM%d",
		 (adev->smuio.funcs && adev->smuio.funcs->get_socket_id) ?
			 adev->smuio.funcs->get_socket_id(adev) :
			 0);

	if (bp_threshold)
		section_desc->flag_bits.exceed_err_threshold = 1;
	if (poison)
		section_desc->flag_bits.latent_err = 1;

	return 0;
}

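/*
 * Append a crashdump section holding the raw ACA register context captured
 * for a fatal (uncorrected) error.
 */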
int amdgpu_cper_entry_fill_fatal_section(struct amdgpu_device *adev,
					 struct cper_hdr *hdr,
					 uint32_t idx,
					 struct cper_sec_crashdump_reg_data reg_data)
{
	struct cper_sec_desc *section_desc;
	struct cper_sec_crashdump_fatal *section;

	section_desc = (struct cper_sec_desc *)((uint8_t *)hdr + SEC_DESC_OFFSET(idx));
	section = (struct cper_sec_crashdump_fatal *)((uint8_t *)hdr +
						      FATAL_SEC_OFFSET(hdr->sec_cnt, idx));

	amdgpu_cper_entry_fill_section_desc(adev, section_desc, false, false,
					    CPER_SEV_FATAL, CRASHDUMP, FATAL_SEC_LEN,
					    FATAL_SEC_OFFSET(hdr->sec_cnt, idx));

	section->body.reg_ctx_type = CPER_CTX_TYPE_CRASH;
	section->body.reg_arr_size = sizeof(reg_data);
	section->body.data = reg_data;

	__inc_entry_length(hdr, SEC_DESC_LEN + FATAL_SEC_LEN);

	return 0;
}

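/*
 * Append a non-standard (AMD runtime) error section. Anything other than a
 * corrected error is flagged as a latent/poison error, and the register
 * dump is clamped to CPER_ACA_REG_COUNT dwords.
 */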
int amdgpu_cper_entry_fill_runtime_section(struct amdgpu_device *adev,
					   struct cper_hdr *hdr,
					   uint32_t idx,
					   enum cper_error_severity sev,
					   uint32_t *reg_dump,
					   uint32_t reg_count)
{
	struct cper_sec_desc *section_desc;
	struct cper_sec_nonstd_err *section;
	bool poison;

	poison = sev != CPER_SEV_NON_FATAL_CORRECTED;
	section_desc = (struct cper_sec_desc *)((uint8_t *)hdr + SEC_DESC_OFFSET(idx));
	section = (struct cper_sec_nonstd_err *)((uint8_t *)hdr +
						 NONSTD_SEC_OFFSET(hdr->sec_cnt, idx));

	amdgpu_cper_entry_fill_section_desc(adev, section_desc, false, poison,
					    sev, RUNTIME, NONSTD_SEC_LEN,
					    NONSTD_SEC_OFFSET(hdr->sec_cnt, idx));

	reg_count = umin(reg_count, CPER_ACA_REG_COUNT);

	section->hdr.valid_bits.err_info_cnt = 1;
	section->hdr.valid_bits.err_context_cnt = 1;

	section->info.error_type = RUNTIME;
	section->info.ms_chk_bits.err_type_valid = 1;
	section->ctx.reg_ctx_type = CPER_CTX_TYPE_CRASH;
	section->ctx.reg_arr_size = sizeof(section->ctx.reg_dump);

	memcpy(section->ctx.reg_dump, reg_dump, reg_count * sizeof(uint32_t));

	__inc_entry_length(hdr, SEC_DESC_LEN + NONSTD_SEC_LEN);

	return 0;
}

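/*
 * Append the bad-page-threshold section. No live ACA bank backs this
 * record, so a fixed register dump is synthesized, with the socket id
 * folded into the IPID register pair.
 */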
int amdgpu_cper_entry_fill_bad_page_threshold_section(struct amdgpu_device *adev,
						      struct cper_hdr *hdr,
						      uint32_t idx)
{
	struct cper_sec_desc *section_desc;
	struct cper_sec_nonstd_err *section;
	uint32_t socket_id;

	section_desc = (struct cper_sec_desc *)((uint8_t *)hdr + SEC_DESC_OFFSET(idx));
	section = (struct cper_sec_nonstd_err *)((uint8_t *)hdr +
						 NONSTD_SEC_OFFSET(hdr->sec_cnt, idx));

	amdgpu_cper_entry_fill_section_desc(adev, section_desc, true, false,
					    CPER_SEV_FATAL, RUNTIME, NONSTD_SEC_LEN,
					    NONSTD_SEC_OFFSET(hdr->sec_cnt, idx));

	section->hdr.valid_bits.err_info_cnt = 1;
	section->hdr.valid_bits.err_context_cnt = 1;

	section->info.error_type = RUNTIME;
	section->info.valid_bits.ms_chk = 1;
	section->info.ms_chk_bits.err_type_valid = 1;
	section->info.ms_chk_bits.err_type = 1;
	section->info.ms_chk_bits.pcc = 1;
	section->ctx.reg_ctx_type = CPER_CTX_TYPE_CRASH;
	section->ctx.reg_arr_size = sizeof(section->ctx.reg_dump);

	/* Hardcoded reg dump for bad page threshold CPER */
	socket_id = (adev->smuio.funcs && adev->smuio.funcs->get_socket_id) ?
			    adev->smuio.funcs->get_socket_id(adev) :
			    0;
	section->ctx.reg_dump[CPER_ACA_REG_CTL_LO]    = 0x1;
	section->ctx.reg_dump[CPER_ACA_REG_CTL_HI]    = 0x0;
	section->ctx.reg_dump[CPER_ACA_REG_STATUS_LO] = 0x137;
	section->ctx.reg_dump[CPER_ACA_REG_STATUS_HI] = 0xB0000000;
	section->ctx.reg_dump[CPER_ACA_REG_ADDR_LO]   = 0x0;
	section->ctx.reg_dump[CPER_ACA_REG_ADDR_HI]   = 0x0;
	section->ctx.reg_dump[CPER_ACA_REG_MISC0_LO]  = 0x0;
	section->ctx.reg_dump[CPER_ACA_REG_MISC0_HI]  = 0x0;
	section->ctx.reg_dump[CPER_ACA_REG_CONFIG_LO] = 0x2;
	section->ctx.reg_dump[CPER_ACA_REG_CONFIG_HI] = 0x1ff;
	section->ctx.reg_dump[CPER_ACA_REG_IPID_LO]   = (socket_id / 4) & 0x01;
	section->ctx.reg_dump[CPER_ACA_REG_IPID_HI]   = 0x096 | (((socket_id % 4) & 0x3) << 12);
	section->ctx.reg_dump[CPER_ACA_REG_SYND_LO]   = 0x0;
	section->ctx.reg_dump[CPER_ACA_REG_SYND_HI]   = 0x0;

	__inc_entry_length(hdr, SEC_DESC_LEN + NONSTD_SEC_LEN);

	return 0;
}

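/*
 * Allocate a zeroed record sized for the header plus section_count
 * descriptors and section bodies of the given type. sec_cnt is stashed
 * before any section is filled because the *_SEC_OFFSET() macros depend
 * on it. Callers follow the pattern used by the generators below:
 *
 *	hdr = amdgpu_cper_alloc_entry(adev, AMDGPU_CPER_TYPE_FATAL, 1);
 *	amdgpu_cper_entry_fill_hdr(adev, hdr, AMDGPU_CPER_TYPE_FATAL,
 *				   CPER_SEV_FATAL);
 *	amdgpu_cper_entry_fill_fatal_section(adev, hdr, 0, reg_data);
 *	amdgpu_cper_ring_write(ring, hdr, hdr->record_length);
 *	kfree(hdr);
 */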
struct cper_hdr *amdgpu_cper_alloc_entry(struct amdgpu_device *adev,
					 enum amdgpu_cper_type type,
					 uint16_t section_count)
{
	struct cper_hdr *hdr;
	uint32_t size = 0;

	size += HDR_LEN;
	size += (SEC_DESC_LEN * section_count);

	switch (type) {
	case AMDGPU_CPER_TYPE_RUNTIME:
	case AMDGPU_CPER_TYPE_BP_THRESHOLD:
		size += (NONSTD_SEC_LEN * section_count);
		break;
	case AMDGPU_CPER_TYPE_FATAL:
		size += (FATAL_SEC_LEN * section_count);
		break;
	case AMDGPU_CPER_TYPE_BOOT:
		size += (BOOT_SEC_LEN * section_count);
		break;
	default:
		dev_err(adev->dev, "Unknown CPER Type!\n");
		return NULL;
	}

	hdr = kzalloc(size, GFP_KERNEL);
	if (!hdr)
		return NULL;

	/* Save this early */
	hdr->sec_cnt = section_count;

	return hdr;
}

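/*
 * Build and queue a single-section fatal record from one ACA bank's
 * status/addr/ipid/synd registers.
 */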
int amdgpu_cper_generate_ue_record(struct amdgpu_device *adev,
				   struct aca_bank *bank)
{
	struct cper_hdr *fatal = NULL;
	struct cper_sec_crashdump_reg_data reg_data = { 0 };
	struct amdgpu_ring *ring = &adev->cper.ring_buf;
	int ret;

	fatal = amdgpu_cper_alloc_entry(adev, AMDGPU_CPER_TYPE_FATAL, 1);
	if (!fatal) {
		dev_err(adev->dev, "fail to alloc cper entry for ue record\n");
		return -ENOMEM;
	}

	reg_data.status_lo = lower_32_bits(bank->regs[ACA_REG_IDX_STATUS]);
	reg_data.status_hi = upper_32_bits(bank->regs[ACA_REG_IDX_STATUS]);
	reg_data.addr_lo = lower_32_bits(bank->regs[ACA_REG_IDX_ADDR]);
	reg_data.addr_hi = upper_32_bits(bank->regs[ACA_REG_IDX_ADDR]);
	reg_data.ipid_lo = lower_32_bits(bank->regs[ACA_REG_IDX_IPID]);
	reg_data.ipid_hi = upper_32_bits(bank->regs[ACA_REG_IDX_IPID]);
	reg_data.synd_lo = lower_32_bits(bank->regs[ACA_REG_IDX_SYND]);
	reg_data.synd_hi = upper_32_bits(bank->regs[ACA_REG_IDX_SYND]);

	amdgpu_cper_entry_fill_hdr(adev, fatal, AMDGPU_CPER_TYPE_FATAL, CPER_SEV_FATAL);
	ret = amdgpu_cper_entry_fill_fatal_section(adev, fatal, 0, reg_data);
	if (ret) {
		kfree(fatal);
		return ret;
	}

	amdgpu_cper_ring_write(ring, fatal, fatal->record_length);
	kfree(fatal);

	return 0;
}

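/*
 * Build and queue the record that reports the bad (retired) page count
 * exceeding its threshold.
 */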
int amdgpu_cper_generate_bp_threshold_record(struct amdgpu_device *adev)
{
	struct cper_hdr *bp_threshold = NULL;
	struct amdgpu_ring *ring = &adev->cper.ring_buf;
	int ret;

	bp_threshold = amdgpu_cper_alloc_entry(adev, AMDGPU_CPER_TYPE_BP_THRESHOLD, 1);
	if (!bp_threshold) {
		dev_err(adev->dev, "fail to alloc cper entry for bad page threshold record\n");
		return -ENOMEM;
	}

	amdgpu_cper_entry_fill_hdr(adev, bp_threshold,
				   AMDGPU_CPER_TYPE_BP_THRESHOLD,
				   CPER_SEV_FATAL);
	ret = amdgpu_cper_entry_fill_bad_page_threshold_section(adev, bp_threshold, 0);
	if (ret) {
		kfree(bp_threshold);
		return ret;
	}

	amdgpu_cper_ring_write(ring, bp_threshold, bp_threshold->record_length);
	kfree(bp_threshold);

	return 0;
}

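/* Map an ACA error type onto the closest CPER severity. */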
static enum cper_error_severity amdgpu_aca_err_type_to_cper_sev(struct amdgpu_device *adev,
								enum aca_error_type aca_err_type)
{
	switch (aca_err_type) {
	case ACA_ERROR_TYPE_UE:
		return CPER_SEV_FATAL;
	case ACA_ERROR_TYPE_CE:
		return CPER_SEV_NON_FATAL_CORRECTED;
	case ACA_ERROR_TYPE_DEFERRED:
		return CPER_SEV_NON_FATAL_UNCORRECTED;
	default:
		dev_err(adev->dev, "Unknown ACA error type!\n");
		return CPER_SEV_FATAL;
	}
}

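/*
 * Pack a batch of corrected/deferred ACA banks into one runtime record,
 * one section per bank; the record severity is raised to non-fatal
 * uncorrected if any bank carries a deferred error.
 */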
int amdgpu_cper_generate_ce_records(struct amdgpu_device *adev,
				    struct aca_banks *banks,
				    uint16_t bank_count)
{
	struct cper_hdr *corrected = NULL;
	enum cper_error_severity sev = CPER_SEV_NON_FATAL_CORRECTED;
	struct amdgpu_ring *ring = &adev->cper.ring_buf;
	uint32_t reg_data[CPER_ACA_REG_COUNT] = { 0 };
	struct aca_bank_node *node;
	struct aca_bank *bank;
	uint32_t i = 0;
	int ret;

	corrected = amdgpu_cper_alloc_entry(adev, AMDGPU_CPER_TYPE_RUNTIME, bank_count);
	if (!corrected) {
		dev_err(adev->dev, "fail to allocate cper entry for ce records\n");
		return -ENOMEM;
	}

	/* Raise severity if any DE is detected in the ACA bank list */
	list_for_each_entry(node, &banks->list, node) {
		bank = &node->bank;
		if (bank->aca_err_type == ACA_ERROR_TYPE_DEFERRED) {
			sev = CPER_SEV_NON_FATAL_UNCORRECTED;
			break;
		}
	}

	amdgpu_cper_entry_fill_hdr(adev, corrected, AMDGPU_CPER_TYPE_RUNTIME, sev);

	/* Combine CE and DE in cper record */
	list_for_each_entry(node, &banks->list, node) {
		bank = &node->bank;
		reg_data[CPER_ACA_REG_CTL_LO]    = lower_32_bits(bank->regs[ACA_REG_IDX_CTL]);
		reg_data[CPER_ACA_REG_CTL_HI]    = upper_32_bits(bank->regs[ACA_REG_IDX_CTL]);
		reg_data[CPER_ACA_REG_STATUS_LO] = lower_32_bits(bank->regs[ACA_REG_IDX_STATUS]);
		reg_data[CPER_ACA_REG_STATUS_HI] = upper_32_bits(bank->regs[ACA_REG_IDX_STATUS]);
		reg_data[CPER_ACA_REG_ADDR_LO]   = lower_32_bits(bank->regs[ACA_REG_IDX_ADDR]);
		reg_data[CPER_ACA_REG_ADDR_HI]   = upper_32_bits(bank->regs[ACA_REG_IDX_ADDR]);
		reg_data[CPER_ACA_REG_MISC0_LO]  = lower_32_bits(bank->regs[ACA_REG_IDX_MISC0]);
		reg_data[CPER_ACA_REG_MISC0_HI]  = upper_32_bits(bank->regs[ACA_REG_IDX_MISC0]);
		reg_data[CPER_ACA_REG_CONFIG_LO] = lower_32_bits(bank->regs[ACA_REG_IDX_CONFIG]);
		reg_data[CPER_ACA_REG_CONFIG_HI] = upper_32_bits(bank->regs[ACA_REG_IDX_CONFIG]);
		reg_data[CPER_ACA_REG_IPID_LO]   = lower_32_bits(bank->regs[ACA_REG_IDX_IPID]);
		reg_data[CPER_ACA_REG_IPID_HI]   = upper_32_bits(bank->regs[ACA_REG_IDX_IPID]);
		reg_data[CPER_ACA_REG_SYND_LO]   = lower_32_bits(bank->regs[ACA_REG_IDX_SYND]);
		reg_data[CPER_ACA_REG_SYND_HI]   = upper_32_bits(bank->regs[ACA_REG_IDX_SYND]);

		ret = amdgpu_cper_entry_fill_runtime_section(adev, corrected, i++,
							     amdgpu_aca_err_type_to_cper_sev(adev, bank->aca_err_type),
							     reg_data, CPER_ACA_REG_COUNT);
		if (ret) {
			kfree(corrected);
			return ret;
		}
	}

	amdgpu_cper_ring_write(ring, corrected, corrected->record_length);
	kfree(corrected);

	return 0;
}

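/*
 * Check whether the dword at @pos holds the start of a record, i.e. the
 * "CPER" signature, taking wrap-around at the end of the ring buffer into
 * account.
 */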
static bool amdgpu_cper_is_hdr(struct amdgpu_ring *ring, u64 pos)
{
	char signature[CPER_SIGNATURE_SZ];

	if ((pos << 2) >= ring->ring_size)
		return false;

	if ((pos << 2) + CPER_SIGNATURE_SZ <= ring->ring_size) {
		memcpy(signature, &ring->ring[pos], CPER_SIGNATURE_SZ);
	} else {
		u32 chunk = ring->ring_size - (pos << 2);

		memcpy(signature, &ring->ring[pos], chunk);
		memcpy(signature + chunk, ring->ring, CPER_SIGNATURE_SZ - chunk);
	}

	return !memcmp(signature, "CPER", CPER_SIGNATURE_SZ);
}

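/*
 * Return the size in bytes of the entry starting at dword @pos, capped at
 * the distance to the end of the buffer: the record_length if @pos is a
 * header (reassembled across the wrap if needed), otherwise the gap to the
 * next header, which is used to skip partially overwritten records.
 */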
static u32 amdgpu_cper_ring_get_ent_sz(struct amdgpu_ring *ring, u64 pos)
{
	struct cper_hdr chdr;
	u64 p;
	u32 chunk, rec_len = 0;

	chunk = ring->ring_size - (pos << 2);

	if (amdgpu_cper_is_hdr(ring, pos)) {
		if (chunk >= sizeof(chdr)) {
			memcpy(&chdr, &ring->ring[pos], sizeof(chdr));
		} else {
			memcpy(&chdr, &ring->ring[pos], chunk);
			memcpy((u8 *)&chdr + chunk, ring->ring, sizeof(chdr) - chunk);
		}

		rec_len = chdr.record_length;
		goto calc;
	}

	/* ring buffer is not full, no cper data after ring->wptr */
	if (ring->count_dw)
		goto calc;

	for (p = pos + 1; p <= ring->buf_mask; p++) {
		if (amdgpu_cper_is_hdr(ring, p)) {
			rec_len = (p - pos) << 2;
			goto calc;
		}
	}

calc:
	if (!rec_len)
		return chunk;
	else
		return umin(rec_len, chunk);
}

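/*
 * Copy a record into the ring. Writes may wrap around the buffer end; if
 * the new data overtakes the reader, rptr is advanced record by record so
 * that it always points at a valid header.
 */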
void amdgpu_cper_ring_write(struct amdgpu_ring *ring, void *src, int count)
{
	u64 pos, wptr_old, rptr;
	int rec_cnt_dw = count >> 2;
	u32 chunk, ent_sz;
	u8 *s = (u8 *)src;

	if (count >= ring->ring_size - 4) {
		dev_err(ring->adev->dev,
			"CPER data size(%d) is larger than ring size(%d)\n",
			count, ring->ring_size - 4);

		return;
	}

	mutex_lock(&ring->adev->cper.ring_lock);

	wptr_old = ring->wptr;
	rptr = *ring->rptr_cpu_addr & ring->ptr_mask;

	while (count) {
		ent_sz = amdgpu_cper_ring_get_ent_sz(ring, ring->wptr);
		chunk = umin(ent_sz, count);

		memcpy(&ring->ring[ring->wptr], s, chunk);

		ring->wptr += (chunk >> 2);
		ring->wptr &= ring->ptr_mask;
		count -= chunk;
		s += chunk;
	}

	if (ring->count_dw < rec_cnt_dw)
		ring->count_dw = 0;

	/* the buffer has overflowed, adjust rptr */
	if (((wptr_old < rptr) && (rptr <= ring->wptr)) ||
	    ((ring->wptr < wptr_old) && (wptr_old < rptr)) ||
	    ((rptr <= ring->wptr) && (ring->wptr < wptr_old))) {
		pos = (ring->wptr + 1) & ring->ptr_mask;

		do {
			ent_sz = amdgpu_cper_ring_get_ent_sz(ring, pos);

			rptr += (ent_sz >> 2);
			rptr &= ring->ptr_mask;
			*ring->rptr_cpu_addr = rptr;

			pos = rptr;
		} while (!amdgpu_cper_is_hdr(ring, rptr));
	}

	if (ring->count_dw >= rec_cnt_dw)
		ring->count_dw -= rec_cnt_dw;
	mutex_unlock(&ring->adev->cper.ring_lock);
}

static u64 amdgpu_cper_ring_get_rptr(struct amdgpu_ring *ring)
{
	return *(ring->rptr_cpu_addr);
}

static u64 amdgpu_cper_ring_get_wptr(struct amdgpu_ring *ring)
{
	return ring->wptr;
}

static const struct amdgpu_ring_funcs cper_ring_funcs = {
	.type = AMDGPU_RING_TYPE_CPER,
	.align_mask = 0xff,
	.support_64bit_ptrs = false,
	.get_rptr = amdgpu_cper_ring_get_rptr,
	.get_wptr = amdgpu_cper_ring_get_wptr,
};

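/* Set up the CPU-driven CPER ring; no doorbell or GPU scheduler is involved. */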
static int amdgpu_cper_ring_init(struct amdgpu_device *adev)
{
	struct amdgpu_ring *ring = &(adev->cper.ring_buf);

	mutex_init(&adev->cper.ring_lock);

	ring->adev = NULL;
	ring->ring_obj = NULL;
	ring->use_doorbell = false;
	ring->no_scheduler = true;
	ring->funcs = &cper_ring_funcs;

	sprintf(ring->name, "cper");
	return amdgpu_ring_init(adev, ring, CPER_MAX_RING_SIZE, NULL, 0,
				AMDGPU_RING_PRIO_DEFAULT, NULL);
}

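/*
 * CPER generation is enabled only when a relevant RAS path is active: for
 * SR-IOV VFs when the host exposes CPER, and on bare metal when either the
 * unified RAS path or ACA is enabled.
 */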
int amdgpu_cper_init(struct amdgpu_device *adev)
{
	int r;

	if (amdgpu_sriov_vf(adev) && !amdgpu_sriov_ras_cper_en(adev))
		return 0;
	else if (!amdgpu_sriov_vf(adev) && !amdgpu_uniras_enabled(adev) &&
		 !amdgpu_aca_is_enabled(adev))
		return 0;

	r = amdgpu_cper_ring_init(adev);
	if (r) {
		dev_err(adev->dev, "failed to initialize cper ring, r = %d\n", r);
		return r;
	}

	mutex_init(&adev->cper.cper_lock);

	adev->cper.enabled = true;
	adev->cper.max_count = CPER_MAX_ALLOWED_COUNT;

	return 0;
}

int amdgpu_cper_fini(struct amdgpu_device *adev)
{
	if (!amdgpu_aca_is_enabled(adev) && !amdgpu_sriov_ras_cper_en(adev))
		return 0;

	adev->cper.enabled = false;

	amdgpu_ring_fini(&(adev->cper.ring_buf));
	adev->cper.count = 0;
	adev->cper.wptr = 0;

	return 0;
}