// SPDX-License-Identifier: MIT
/*
 * Copyright 2025 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 */
#include <linux/list.h>
#include "amdgpu.h"

static const guid_t MCE = CPER_NOTIFY_MCE;
static const guid_t CMC = CPER_NOTIFY_CMC;
static const guid_t BOOT = BOOT_TYPE;

static const guid_t CRASHDUMP = AMD_CRASHDUMP;
static const guid_t RUNTIME = AMD_GPU_NONSTANDARD_ERROR;

static void __inc_entry_length(struct cper_hdr *hdr, uint32_t size)
{
        hdr->record_length += size;
}

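/*
 * Convert the current wall-clock time into the CPER timestamp layout:
 * seconds/minutes/hours/day/month plus a two-digit year and century
 * (e.g. 2025 is stored as year 25, century 20).
 */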
static void amdgpu_cper_get_timestamp(struct cper_timestamp *timestamp)
{
        struct tm tm;
        time64_t now = ktime_get_real_seconds();

        time64_to_tm(now, 0, &tm);
        timestamp->seconds = tm.tm_sec;
        timestamp->minutes = tm.tm_min;
        timestamp->hours = tm.tm_hour;
        timestamp->flag = 0;
        timestamp->day = tm.tm_mday;
        timestamp->month = 1 + tm.tm_mon;
        timestamp->year = (1900 + tm.tm_year) % 100;
        timestamp->century = (1900 + tm.tm_year) / 100;
}

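/*
 * Fill the common CPER record header: signature, revision, severity,
 * timestamp, record/platform/creator IDs, and the notification type GUID
 * derived from the amdgpu CPER type (BOOT, MCE for fatal and bad-page
 * threshold records, and CMC or MCE for runtime errors depending on
 * severity).
 */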
void amdgpu_cper_entry_fill_hdr(struct amdgpu_device *adev,
                                struct cper_hdr *hdr,
                                enum amdgpu_cper_type type,
                                enum cper_error_severity sev)
{
        char record_id[16];

        hdr->signature[0] = 'C';
        hdr->signature[1] = 'P';
        hdr->signature[2] = 'E';
        hdr->signature[3] = 'R';
        hdr->revision = CPER_HDR_REV_1;
        hdr->signature_end = 0xFFFFFFFF;
        hdr->error_severity = sev;

        hdr->valid_bits.platform_id = 1;
        hdr->valid_bits.timestamp = 1;

        amdgpu_cper_get_timestamp(&hdr->timestamp);

        snprintf(record_id, 9, "%d:%X",
                 (adev->smuio.funcs && adev->smuio.funcs->get_socket_id) ?
                         adev->smuio.funcs->get_socket_id(adev) :
                         0,
                 atomic_inc_return(&adev->cper.unique_id));
        memcpy(hdr->record_id, record_id, 8);

        snprintf(hdr->platform_id, 16, "0x%04X:0x%04X",
                 adev->pdev->vendor, adev->pdev->device);
        /* pmfw version should be part of creator_id according to CPER spec */
        snprintf(hdr->creator_id, 16, "%s", CPER_CREATOR_ID_AMDGPU);

        switch (type) {
        case AMDGPU_CPER_TYPE_BOOT:
                hdr->notify_type = BOOT;
                break;
        case AMDGPU_CPER_TYPE_FATAL:
        case AMDGPU_CPER_TYPE_BP_THRESHOLD:
                hdr->notify_type = MCE;
                break;
        case AMDGPU_CPER_TYPE_RUNTIME:
                if (sev == CPER_SEV_NON_FATAL_CORRECTED)
                        hdr->notify_type = CMC;
                else
                        hdr->notify_type = MCE;
                break;
        default:
                dev_err(adev->dev, "Unknown CPER Type\n");
                break;
        }

        __inc_entry_length(hdr, HDR_LEN);
}

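/*
 * Fill one section descriptor: revision, offset and length of the section
 * body within the record, severity and section-type GUID. The FRU text is
 * set to "OAM<socket>" so tools can attribute the error to a physical
 * module; the exceed_err_threshold and latent_err flags mark bad-page
 * threshold and poison cases respectively.
 */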
static int amdgpu_cper_entry_fill_section_desc(struct amdgpu_device *adev,
                                               struct cper_sec_desc *section_desc,
                                               bool bp_threshold,
                                               bool poison,
                                               enum cper_error_severity sev,
                                               guid_t sec_type,
                                               uint32_t section_length,
                                               uint32_t section_offset)
{
        section_desc->revision_minor = CPER_SEC_MINOR_REV_1;
        section_desc->revision_major = CPER_SEC_MAJOR_REV_22;
        section_desc->sec_offset = section_offset;
        section_desc->sec_length = section_length;
        section_desc->valid_bits.fru_text = 1;
        section_desc->flag_bits.primary = 1;
        section_desc->severity = sev;
        section_desc->sec_type = sec_type;

        snprintf(section_desc->fru_text, 20, "OAM%d",
                 (adev->smuio.funcs && adev->smuio.funcs->get_socket_id) ?
                         adev->smuio.funcs->get_socket_id(adev) :
                         0);

        if (bp_threshold)
                section_desc->flag_bits.exceed_err_threshold = 1;
        if (poison)
                section_desc->flag_bits.latent_err = 1;

        return 0;
}

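/*
 * Append a crashdump (fatal) section at index @idx: fill its descriptor,
 * copy the raw ACA register snapshot into the section body, and grow the
 * record length by one descriptor plus one fatal section.
 */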
int amdgpu_cper_entry_fill_fatal_section(struct amdgpu_device *adev,
                                         struct cper_hdr *hdr,
                                         uint32_t idx,
                                         struct cper_sec_crashdump_reg_data reg_data)
{
        struct cper_sec_desc *section_desc;
        struct cper_sec_crashdump_fatal *section;

        section_desc = (struct cper_sec_desc *)((uint8_t *)hdr + SEC_DESC_OFFSET(idx));
        section = (struct cper_sec_crashdump_fatal *)((uint8_t *)hdr +
                  FATAL_SEC_OFFSET(hdr->sec_cnt, idx));

        amdgpu_cper_entry_fill_section_desc(adev, section_desc, false, false,
                                            CPER_SEV_FATAL, CRASHDUMP, FATAL_SEC_LEN,
                                            FATAL_SEC_OFFSET(hdr->sec_cnt, idx));

        section->body.reg_ctx_type = CPER_CTX_TYPE_CRASH;
        section->body.reg_arr_size = sizeof(reg_data);
        section->body.data = reg_data;

        __inc_entry_length(hdr, SEC_DESC_LEN + FATAL_SEC_LEN);

        return 0;
}

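/*
 * Append a non-standard (runtime) error section at index @idx. Anything
 * other than a corrected error is marked as poison (latent error), and at
 * most CPER_ACA_REG_COUNT dwords of the caller's register dump are copied
 * into the section context.
 */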
int amdgpu_cper_entry_fill_runtime_section(struct amdgpu_device *adev,
                                           struct cper_hdr *hdr,
                                           uint32_t idx,
                                           enum cper_error_severity sev,
                                           uint32_t *reg_dump,
                                           uint32_t reg_count)
{
        struct cper_sec_desc *section_desc;
        struct cper_sec_nonstd_err *section;
        bool poison;

        poison = sev != CPER_SEV_NON_FATAL_CORRECTED;
        section_desc = (struct cper_sec_desc *)((uint8_t *)hdr + SEC_DESC_OFFSET(idx));
        section = (struct cper_sec_nonstd_err *)((uint8_t *)hdr +
                  NONSTD_SEC_OFFSET(hdr->sec_cnt, idx));

        amdgpu_cper_entry_fill_section_desc(adev, section_desc, false, poison,
                                            sev, RUNTIME, NONSTD_SEC_LEN,
                                            NONSTD_SEC_OFFSET(hdr->sec_cnt, idx));

        reg_count = umin(reg_count, CPER_ACA_REG_COUNT);

        section->hdr.valid_bits.err_info_cnt = 1;
        section->hdr.valid_bits.err_context_cnt = 1;

        section->info.error_type = RUNTIME;
        section->info.ms_chk_bits.err_type_valid = 1;
        section->ctx.reg_ctx_type = CPER_CTX_TYPE_CRASH;
        section->ctx.reg_arr_size = sizeof(section->ctx.reg_dump);

        memcpy(section->ctx.reg_dump, reg_dump, reg_count * sizeof(uint32_t));

        __inc_entry_length(hdr, SEC_DESC_LEN + NONSTD_SEC_LEN);

        return 0;
}

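/*
 * Append a bad-page-threshold section at index @idx. There is no live
 * register dump for this event, so a hardcoded ACA register set is written
 * instead, with the socket id folded into the IPID registers, and the
 * descriptor is flagged as having exceeded the error threshold.
 */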
int amdgpu_cper_entry_fill_bad_page_threshold_section(struct amdgpu_device *adev,
                                                      struct cper_hdr *hdr,
                                                      uint32_t idx)
{
        struct cper_sec_desc *section_desc;
        struct cper_sec_nonstd_err *section;
        uint32_t socket_id;

        section_desc = (struct cper_sec_desc *)((uint8_t *)hdr + SEC_DESC_OFFSET(idx));
        section = (struct cper_sec_nonstd_err *)((uint8_t *)hdr +
                  NONSTD_SEC_OFFSET(hdr->sec_cnt, idx));

        amdgpu_cper_entry_fill_section_desc(adev, section_desc, true, false,
                                            CPER_SEV_FATAL, RUNTIME, NONSTD_SEC_LEN,
                                            NONSTD_SEC_OFFSET(hdr->sec_cnt, idx));

        section->hdr.valid_bits.err_info_cnt = 1;
        section->hdr.valid_bits.err_context_cnt = 1;

        section->info.error_type = RUNTIME;
        section->info.valid_bits.ms_chk = 1;
        section->info.ms_chk_bits.err_type_valid = 1;
        section->info.ms_chk_bits.err_type = 1;
        section->info.ms_chk_bits.pcc = 1;
        section->ctx.reg_ctx_type = CPER_CTX_TYPE_CRASH;
        section->ctx.reg_arr_size = sizeof(section->ctx.reg_dump);

        /* Hardcoded Reg dump for bad page threshold CPER */
        socket_id = (adev->smuio.funcs && adev->smuio.funcs->get_socket_id) ?
                            adev->smuio.funcs->get_socket_id(adev) :
                            0;
        section->ctx.reg_dump[CPER_ACA_REG_CTL_LO] = 0x1;
        section->ctx.reg_dump[CPER_ACA_REG_CTL_HI] = 0x0;
        section->ctx.reg_dump[CPER_ACA_REG_STATUS_LO] = 0x137;
        section->ctx.reg_dump[CPER_ACA_REG_STATUS_HI] = 0xB0000000;
        section->ctx.reg_dump[CPER_ACA_REG_ADDR_LO] = 0x0;
        section->ctx.reg_dump[CPER_ACA_REG_ADDR_HI] = 0x0;
        section->ctx.reg_dump[CPER_ACA_REG_MISC0_LO] = 0x0;
        section->ctx.reg_dump[CPER_ACA_REG_MISC0_HI] = 0x0;
        section->ctx.reg_dump[CPER_ACA_REG_CONFIG_LO] = 0x2;
        section->ctx.reg_dump[CPER_ACA_REG_CONFIG_HI] = 0x1ff;
        section->ctx.reg_dump[CPER_ACA_REG_IPID_LO] = (socket_id / 4) & 0x01;
        section->ctx.reg_dump[CPER_ACA_REG_IPID_HI] = 0x096 | (((socket_id % 4) & 0x3) << 12);
        section->ctx.reg_dump[CPER_ACA_REG_SYND_LO] = 0x0;
        section->ctx.reg_dump[CPER_ACA_REG_SYND_HI] = 0x0;

        __inc_entry_length(hdr, SEC_DESC_LEN + NONSTD_SEC_LEN);

        return 0;
}

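/*
 * Allocate a zeroed CPER record sized for @section_count sections of the
 * given type. Rough layout of the returned buffer (a sketch; the exact
 * placement comes from SEC_DESC_OFFSET() and the per-type section offset
 * macros):
 *
 *   +--------------------------+
 *   | cper_hdr                 |  HDR_LEN
 *   +--------------------------+
 *   | section descriptors 0..N |  N * SEC_DESC_LEN
 *   +--------------------------+
 *   | section bodies 0..N      |  N * NONSTD/FATAL/BOOT_SEC_LEN
 *   +--------------------------+
 *
 * Only sec_cnt is filled in here; the caller is expected to fill the header
 * and each section before writing the record to the CPER ring.
 */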
struct cper_hdr *amdgpu_cper_alloc_entry(struct amdgpu_device *adev,
                                         enum amdgpu_cper_type type,
                                         uint16_t section_count)
{
        struct cper_hdr *hdr;
        uint32_t size = 0;

        size += HDR_LEN;
        size += (SEC_DESC_LEN * section_count);

        switch (type) {
        case AMDGPU_CPER_TYPE_RUNTIME:
        case AMDGPU_CPER_TYPE_BP_THRESHOLD:
                size += (NONSTD_SEC_LEN * section_count);
                break;
        case AMDGPU_CPER_TYPE_FATAL:
                size += (FATAL_SEC_LEN * section_count);
                break;
        case AMDGPU_CPER_TYPE_BOOT:
                size += (BOOT_SEC_LEN * section_count);
                break;
        default:
                dev_err(adev->dev, "Unknown CPER Type!\n");
                return NULL;
        }

        hdr = kzalloc(size, GFP_KERNEL);
        if (!hdr)
                return NULL;

        /* Save this early */
        hdr->sec_cnt = section_count;

        return hdr;
}

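/*
 * Build and queue a fatal (UE) CPER record from a single ACA bank: one
 * crashdump section carrying the bank's STATUS/ADDR/IPID/SYND registers,
 * written to the CPER ring and then freed.
 */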
int amdgpu_cper_generate_ue_record(struct amdgpu_device *adev,
                                   struct aca_bank *bank)
{
        struct cper_hdr *fatal = NULL;
        struct cper_sec_crashdump_reg_data reg_data = { 0 };
        struct amdgpu_ring *ring = &adev->cper.ring_buf;
        int ret;

        fatal = amdgpu_cper_alloc_entry(adev, AMDGPU_CPER_TYPE_FATAL, 1);
        if (!fatal) {
                dev_err(adev->dev, "fail to alloc cper entry for ue record\n");
                return -ENOMEM;
        }

        reg_data.status_lo = lower_32_bits(bank->regs[ACA_REG_IDX_STATUS]);
        reg_data.status_hi = upper_32_bits(bank->regs[ACA_REG_IDX_STATUS]);
        reg_data.addr_lo = lower_32_bits(bank->regs[ACA_REG_IDX_ADDR]);
        reg_data.addr_hi = upper_32_bits(bank->regs[ACA_REG_IDX_ADDR]);
        reg_data.ipid_lo = lower_32_bits(bank->regs[ACA_REG_IDX_IPID]);
        reg_data.ipid_hi = upper_32_bits(bank->regs[ACA_REG_IDX_IPID]);
        reg_data.synd_lo = lower_32_bits(bank->regs[ACA_REG_IDX_SYND]);
        reg_data.synd_hi = upper_32_bits(bank->regs[ACA_REG_IDX_SYND]);

        amdgpu_cper_entry_fill_hdr(adev, fatal, AMDGPU_CPER_TYPE_FATAL, CPER_SEV_FATAL);
        ret = amdgpu_cper_entry_fill_fatal_section(adev, fatal, 0, reg_data);
        if (ret) {
                kfree(fatal);
                return ret;
        }

        amdgpu_cper_ring_write(ring, fatal, fatal->record_length);
        kfree(fatal);

        return 0;
}

int amdgpu_cper_generate_bp_threshold_record(struct amdgpu_device *adev)
{
        struct cper_hdr *bp_threshold = NULL;
        struct amdgpu_ring *ring = &adev->cper.ring_buf;
        int ret;

        bp_threshold = amdgpu_cper_alloc_entry(adev, AMDGPU_CPER_TYPE_BP_THRESHOLD, 1);
        if (!bp_threshold) {
                dev_err(adev->dev, "fail to alloc cper entry for bad page threshold record\n");
                return -ENOMEM;
        }

        amdgpu_cper_entry_fill_hdr(adev, bp_threshold,
                                   AMDGPU_CPER_TYPE_BP_THRESHOLD,
                                   CPER_SEV_FATAL);
        ret = amdgpu_cper_entry_fill_bad_page_threshold_section(adev, bp_threshold, 0);
        if (ret) {
                kfree(bp_threshold);
                return ret;
        }

        amdgpu_cper_ring_write(ring, bp_threshold, bp_threshold->record_length);
        kfree(bp_threshold);

        return 0;
}

static enum cper_error_severity amdgpu_aca_err_type_to_cper_sev(struct amdgpu_device *adev,
                                                                enum aca_error_type aca_err_type)
{
        switch (aca_err_type) {
        case ACA_ERROR_TYPE_UE:
                return CPER_SEV_FATAL;
        case ACA_ERROR_TYPE_CE:
                return CPER_SEV_NON_FATAL_CORRECTED;
        case ACA_ERROR_TYPE_DEFERRED:
                return CPER_SEV_NON_FATAL_UNCORRECTED;
        default:
                dev_err(adev->dev, "Unknown ACA error type!\n");
                return CPER_SEV_FATAL;
        }
}

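/*
 * Build and queue one runtime CPER record that carries every bank in the
 * ACA list as its own non-standard error section. The record severity is
 * corrected unless any bank holds a deferred error, in which case it is
 * raised to non-fatal uncorrected.
 */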
int amdgpu_cper_generate_ce_records(struct amdgpu_device *adev,
                                    struct aca_banks *banks,
                                    uint16_t bank_count)
{
        struct cper_hdr *corrected = NULL;
        enum cper_error_severity sev = CPER_SEV_NON_FATAL_CORRECTED;
        struct amdgpu_ring *ring = &adev->cper.ring_buf;
        uint32_t reg_data[CPER_ACA_REG_COUNT] = { 0 };
        struct aca_bank_node *node;
        struct aca_bank *bank;
        uint32_t i = 0;
        int ret;

        corrected = amdgpu_cper_alloc_entry(adev, AMDGPU_CPER_TYPE_RUNTIME, bank_count);
        if (!corrected) {
                dev_err(adev->dev, "fail to allocate cper entry for ce records\n");
                return -ENOMEM;
        }

        /* Raise severity if any DE is detected in the ACA bank list */
        list_for_each_entry(node, &banks->list, node) {
                bank = &node->bank;
                if (bank->aca_err_type == ACA_ERROR_TYPE_DEFERRED) {
                        sev = CPER_SEV_NON_FATAL_UNCORRECTED;
                        break;
                }
        }

        amdgpu_cper_entry_fill_hdr(adev, corrected, AMDGPU_CPER_TYPE_RUNTIME, sev);

        /* Combine CE and DE in cper record */
        list_for_each_entry(node, &banks->list, node) {
                bank = &node->bank;
                reg_data[CPER_ACA_REG_CTL_LO] = lower_32_bits(bank->regs[ACA_REG_IDX_CTL]);
                reg_data[CPER_ACA_REG_CTL_HI] = upper_32_bits(bank->regs[ACA_REG_IDX_CTL]);
                reg_data[CPER_ACA_REG_STATUS_LO] = lower_32_bits(bank->regs[ACA_REG_IDX_STATUS]);
                reg_data[CPER_ACA_REG_STATUS_HI] = upper_32_bits(bank->regs[ACA_REG_IDX_STATUS]);
                reg_data[CPER_ACA_REG_ADDR_LO] = lower_32_bits(bank->regs[ACA_REG_IDX_ADDR]);
                reg_data[CPER_ACA_REG_ADDR_HI] = upper_32_bits(bank->regs[ACA_REG_IDX_ADDR]);
                reg_data[CPER_ACA_REG_MISC0_LO] = lower_32_bits(bank->regs[ACA_REG_IDX_MISC0]);
                reg_data[CPER_ACA_REG_MISC0_HI] = upper_32_bits(bank->regs[ACA_REG_IDX_MISC0]);
                reg_data[CPER_ACA_REG_CONFIG_LO] = lower_32_bits(bank->regs[ACA_REG_IDX_CONFIG]);
                reg_data[CPER_ACA_REG_CONFIG_HI] = upper_32_bits(bank->regs[ACA_REG_IDX_CONFIG]);
                reg_data[CPER_ACA_REG_IPID_LO] = lower_32_bits(bank->regs[ACA_REG_IDX_IPID]);
                reg_data[CPER_ACA_REG_IPID_HI] = upper_32_bits(bank->regs[ACA_REG_IDX_IPID]);
                reg_data[CPER_ACA_REG_SYND_LO] = lower_32_bits(bank->regs[ACA_REG_IDX_SYND]);
                reg_data[CPER_ACA_REG_SYND_HI] = upper_32_bits(bank->regs[ACA_REG_IDX_SYND]);

                ret = amdgpu_cper_entry_fill_runtime_section(adev, corrected, i++,
                                amdgpu_aca_err_type_to_cper_sev(adev, bank->aca_err_type),
                                reg_data, CPER_ACA_REG_COUNT);
                if (ret) {
                        kfree(corrected);
                        return ret;
                }
        }

        amdgpu_cper_ring_write(ring, corrected, corrected->record_length);
        kfree(corrected);

        return 0;
}

static bool amdgpu_cper_is_hdr(struct amdgpu_ring *ring, u64 pos)
{
        struct cper_hdr *chdr;

        chdr = (struct cper_hdr *)&(ring->ring[pos]);
        return strcmp(chdr->signature, "CPER") ? false : true;
}

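/*
 * Return how many bytes can be consumed at @pos without crossing into the
 * next record: the record length if @pos sits on a "CPER" header, otherwise
 * the distance to the next header (or to the end of the ring when there is
 * no further CPER data), always capped at the space left before the ring
 * wraps.
 */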
static u32 amdgpu_cper_ring_get_ent_sz(struct amdgpu_ring *ring, u64 pos)
{
        struct cper_hdr *chdr;
        u64 p;
        u32 chunk, rec_len = 0;

        chdr = (struct cper_hdr *)&(ring->ring[pos]);
        chunk = ring->ring_size - (pos << 2);

        if (!strcmp(chdr->signature, "CPER")) {
                rec_len = chdr->record_length;
                goto calc;
        }

        /* ring buffer is not full, no cper data after ring->wptr */
        if (ring->count_dw)
                goto calc;

        for (p = pos + 1; p <= ring->buf_mask; p++) {
                chdr = (struct cper_hdr *)&(ring->ring[p]);
                if (!strcmp(chdr->signature, "CPER")) {
                        rec_len = (p - pos) << 2;
                        goto calc;
                }
        }

calc:
        if (!rec_len)
                return chunk;
        else
                return umin(rec_len, chunk);
}

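/*
 * Copy a CPER record into the ring at the current write pointer, wrapping
 * as needed. If the write overruns unread data, the read pointer is pushed
 * forward record by record until it lands on the next intact "CPER" header,
 * so readers never start in the middle of a record.
 */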
void amdgpu_cper_ring_write(struct amdgpu_ring *ring, void *src, int count)
{
        u64 pos, wptr_old, rptr;
        int rec_cnt_dw = count >> 2;
        u32 chunk, ent_sz;
        u8 *s = (u8 *)src;

        if (count >= ring->ring_size - 4) {
                dev_err(ring->adev->dev,
                        "CPER data size(%d) is larger than ring size(%d)\n",
                        count, ring->ring_size - 4);

                return;
        }

        mutex_lock(&ring->adev->cper.ring_lock);

        wptr_old = ring->wptr;
        rptr = *ring->rptr_cpu_addr & ring->ptr_mask;

        while (count) {
                ent_sz = amdgpu_cper_ring_get_ent_sz(ring, ring->wptr);
                chunk = umin(ent_sz, count);

                memcpy(&ring->ring[ring->wptr], s, chunk);

                ring->wptr += (chunk >> 2);
                ring->wptr &= ring->ptr_mask;
                count -= chunk;
                s += chunk;
        }

        if (ring->count_dw < rec_cnt_dw)
                ring->count_dw = 0;

        /* the buffer has overflowed, adjust rptr */
        if (((wptr_old < rptr) && (rptr <= ring->wptr)) ||
            ((ring->wptr < wptr_old) && (wptr_old < rptr)) ||
            ((rptr <= ring->wptr) && (ring->wptr < wptr_old))) {
                pos = (ring->wptr + 1) & ring->ptr_mask;

                do {
                        ent_sz = amdgpu_cper_ring_get_ent_sz(ring, pos);

                        rptr += (ent_sz >> 2);
                        rptr &= ring->ptr_mask;
                        *ring->rptr_cpu_addr = rptr;

                        pos = rptr;
                } while (!amdgpu_cper_is_hdr(ring, rptr));
        }

        if (ring->count_dw >= rec_cnt_dw)
                ring->count_dw -= rec_cnt_dw;
        mutex_unlock(&ring->adev->cper.ring_lock);
}

static u64 amdgpu_cper_ring_get_rptr(struct amdgpu_ring *ring)
{
        return *(ring->rptr_cpu_addr);
}

static u64 amdgpu_cper_ring_get_wptr(struct amdgpu_ring *ring)
{
        return ring->wptr;
}

static const struct amdgpu_ring_funcs cper_ring_funcs = {
        .type = AMDGPU_RING_TYPE_CPER,
        .align_mask = 0xff,
        .support_64bit_ptrs = false,
        .get_rptr = amdgpu_cper_ring_get_rptr,
        .get_wptr = amdgpu_cper_ring_get_wptr,
};

static int amdgpu_cper_ring_init(struct amdgpu_device *adev)
{
        struct amdgpu_ring *ring = &(adev->cper.ring_buf);

        mutex_init(&adev->cper.ring_lock);

        ring->adev = NULL;
        ring->ring_obj = NULL;
        ring->use_doorbell = false;
        ring->no_scheduler = true;
        ring->funcs = &cper_ring_funcs;

        sprintf(ring->name, "cper");
        return amdgpu_ring_init(adev, ring, CPER_MAX_RING_SIZE, NULL, 0,
                                AMDGPU_RING_PRIO_DEFAULT, NULL);
}

int amdgpu_cper_init(struct amdgpu_device *adev)
{
        int r;

        if (!amdgpu_aca_is_enabled(adev) && !amdgpu_sriov_ras_cper_en(adev))
                return 0;

        r = amdgpu_cper_ring_init(adev);
        if (r) {
                dev_err(adev->dev, "failed to initialize cper ring, r = %d\n", r);
                return r;
        }

        mutex_init(&adev->cper.cper_lock);

        adev->cper.enabled = true;
        adev->cper.max_count = CPER_MAX_ALLOWED_COUNT;

        return 0;
}

int amdgpu_cper_fini(struct amdgpu_device *adev)
{
        if (!amdgpu_aca_is_enabled(adev) && !amdgpu_sriov_ras_cper_en(adev))
                return 0;

        adev->cper.enabled = false;

        amdgpu_ring_fini(&(adev->cper.ring_buf));
        adev->cper.count = 0;
        adev->cper.wptr = 0;

        return 0;
}