// SPDX-License-Identifier: MIT
/*
 * Copyright 2025 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 */
#include <linux/list.h>
#include "amdgpu.h"
#include "amdgpu_ras_mgr.h"

static const guid_t MCE = CPER_NOTIFY_MCE;
static const guid_t CMC = CPER_NOTIFY_CMC;
static const guid_t BOOT = BOOT_TYPE;

static const guid_t CRASHDUMP = AMD_CRASHDUMP;
static const guid_t RUNTIME = AMD_GPU_NONSTANDARD_ERROR;

static void __inc_entry_length(struct cper_hdr *hdr, uint32_t size)
{
	hdr->record_length += size;
}

static void amdgpu_cper_get_timestamp(struct cper_timestamp *timestamp)
{
	struct tm tm;
	time64_t now = ktime_get_real_seconds();

	time64_to_tm(now, 0, &tm);
	timestamp->seconds = tm.tm_sec;
	timestamp->minutes = tm.tm_min;
	timestamp->hours = tm.tm_hour;
	timestamp->flag = 0;
	timestamp->day = tm.tm_mday;
	timestamp->month = 1 + tm.tm_mon;
	timestamp->year = (1900 + tm.tm_year) % 100;
	timestamp->century = (1900 + tm.tm_year) / 100;
}

void amdgpu_cper_entry_fill_hdr(struct amdgpu_device *adev,
				struct cper_hdr *hdr,
				enum amdgpu_cper_type type,
				enum cper_error_severity sev)
{
	char record_id[16];

	hdr->signature[0] = 'C';
	hdr->signature[1] = 'P';
	hdr->signature[2] = 'E';
	hdr->signature[3] = 'R';
	hdr->revision = CPER_HDR_REV_1;
	hdr->signature_end = 0xFFFFFFFF;
	hdr->error_severity = sev;

	hdr->valid_bits.platform_id = 1;
	hdr->valid_bits.timestamp = 1;

	amdgpu_cper_get_timestamp(&hdr->timestamp);

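	/*
	 * Record ID is "<socket id>:<per-device sequence number>", e.g. "0:1A",
	 * truncated to the 8 characters that fit the CPER record_id field.
	 */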
	snprintf(record_id, 9, "%d:%X",
		 (adev->smuio.funcs && adev->smuio.funcs->get_socket_id) ?
			 adev->smuio.funcs->get_socket_id(adev) :
			 0,
		 atomic_inc_return(&adev->cper.unique_id));
	memcpy(hdr->record_id, record_id, 8);

	snprintf(hdr->platform_id, 16, "0x%04X:0x%04X",
		 adev->pdev->vendor, adev->pdev->device);
	/* pmfw version should be part of creator_id according to CPER spec */
	snprintf(hdr->creator_id, 16, "%s", CPER_CREATOR_ID_AMDGPU);

	switch (type) {
	case AMDGPU_CPER_TYPE_BOOT:
		hdr->notify_type = BOOT;
		break;
	case AMDGPU_CPER_TYPE_FATAL:
	case AMDGPU_CPER_TYPE_BP_THRESHOLD:
		hdr->notify_type = MCE;
		break;
	case AMDGPU_CPER_TYPE_RUNTIME:
		if (sev == CPER_SEV_NON_FATAL_CORRECTED)
			hdr->notify_type = CMC;
		else
			hdr->notify_type = MCE;
		break;
	default:
		dev_err(adev->dev, "Unknown CPER Type\n");
		break;
	}

	__inc_entry_length(hdr, HDR_LEN);
}

static int amdgpu_cper_entry_fill_section_desc(struct amdgpu_device *adev,
					       struct cper_sec_desc *section_desc,
					       bool bp_threshold,
					       bool poison,
					       enum cper_error_severity sev,
					       guid_t sec_type,
					       uint32_t section_length,
					       uint32_t section_offset)
{
	section_desc->revision_minor = CPER_SEC_MINOR_REV_1;
	section_desc->revision_major = CPER_SEC_MAJOR_REV_22;
	section_desc->sec_offset = section_offset;
	section_desc->sec_length = section_length;
	section_desc->valid_bits.fru_text = 1;
	section_desc->flag_bits.primary = 1;
	section_desc->severity = sev;
	section_desc->sec_type = sec_type;

	snprintf(section_desc->fru_text, 20, "OAM%d",
		 (adev->smuio.funcs && adev->smuio.funcs->get_socket_id) ?
			 adev->smuio.funcs->get_socket_id(adev) :
			 0);

	if (bp_threshold)
		section_desc->flag_bits.exceed_err_threshold = 1;
	if (poison)
		section_desc->flag_bits.latent_err = 1;

	return 0;
}

int amdgpu_cper_entry_fill_fatal_section(struct amdgpu_device *adev,
					 struct cper_hdr *hdr,
					 uint32_t idx,
					 struct cper_sec_crashdump_reg_data reg_data)
{
	struct cper_sec_desc *section_desc;
	struct cper_sec_crashdump_fatal *section;

	section_desc = (struct cper_sec_desc *)((uint8_t *)hdr + SEC_DESC_OFFSET(idx));
	section = (struct cper_sec_crashdump_fatal *)((uint8_t *)hdr +
						      FATAL_SEC_OFFSET(hdr->sec_cnt, idx));

	amdgpu_cper_entry_fill_section_desc(adev, section_desc, false, false,
					    CPER_SEV_FATAL, CRASHDUMP, FATAL_SEC_LEN,
					    FATAL_SEC_OFFSET(hdr->sec_cnt, idx));

	section->body.reg_ctx_type = CPER_CTX_TYPE_CRASH;
	section->body.reg_arr_size = sizeof(reg_data);
	section->body.data = reg_data;

	__inc_entry_length(hdr, SEC_DESC_LEN + FATAL_SEC_LEN);

	return 0;
}

int amdgpu_cper_entry_fill_runtime_section(struct amdgpu_device *adev,
					   struct cper_hdr *hdr,
					   uint32_t idx,
					   enum cper_error_severity sev,
					   uint32_t *reg_dump,
					   uint32_t reg_count)
{
	struct cper_sec_desc *section_desc;
	struct cper_sec_nonstd_err *section;
	bool poison;

	poison = sev != CPER_SEV_NON_FATAL_CORRECTED;
	section_desc = (struct cper_sec_desc *)((uint8_t *)hdr + SEC_DESC_OFFSET(idx));
	section = (struct cper_sec_nonstd_err *)((uint8_t *)hdr +
						 NONSTD_SEC_OFFSET(hdr->sec_cnt, idx));

	amdgpu_cper_entry_fill_section_desc(adev, section_desc, false, poison,
					    sev, RUNTIME, NONSTD_SEC_LEN,
					    NONSTD_SEC_OFFSET(hdr->sec_cnt, idx));

	reg_count = umin(reg_count, CPER_ACA_REG_COUNT);

	section->hdr.valid_bits.err_info_cnt = 1;
	section->hdr.valid_bits.err_context_cnt = 1;

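	/*
	 * Tag the error info as an AMD non-standard runtime error and attach
	 * the raw ACA bank registers as a crash-type register context.
	 */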
	section->info.error_type = RUNTIME;
	section->info.ms_chk_bits.err_type_valid = 1;
	section->ctx.reg_ctx_type = CPER_CTX_TYPE_CRASH;
	section->ctx.reg_arr_size = sizeof(section->ctx.reg_dump);

	memcpy(section->ctx.reg_dump, reg_dump, reg_count * sizeof(uint32_t));

	__inc_entry_length(hdr, SEC_DESC_LEN + NONSTD_SEC_LEN);

	return 0;
}

int amdgpu_cper_entry_fill_bad_page_threshold_section(struct amdgpu_device *adev,
						      struct cper_hdr *hdr,
						      uint32_t idx)
{
	struct cper_sec_desc *section_desc;
	struct cper_sec_nonstd_err *section;
	uint32_t socket_id;

	section_desc = (struct cper_sec_desc *)((uint8_t *)hdr + SEC_DESC_OFFSET(idx));
	section = (struct cper_sec_nonstd_err *)((uint8_t *)hdr +
						 NONSTD_SEC_OFFSET(hdr->sec_cnt, idx));

	amdgpu_cper_entry_fill_section_desc(adev, section_desc, true, false,
					    CPER_SEV_FATAL, RUNTIME, NONSTD_SEC_LEN,
					    NONSTD_SEC_OFFSET(hdr->sec_cnt, idx));

	section->hdr.valid_bits.err_info_cnt = 1;
	section->hdr.valid_bits.err_context_cnt = 1;

	section->info.error_type = RUNTIME;
	section->info.valid_bits.ms_chk = 1;
	section->info.ms_chk_bits.err_type_valid = 1;
	section->info.ms_chk_bits.err_type = 1;
	section->info.ms_chk_bits.pcc = 1;
	section->ctx.reg_ctx_type = CPER_CTX_TYPE_CRASH;
	section->ctx.reg_arr_size = sizeof(section->ctx.reg_dump);

	/* Hardcoded Reg dump for bad page threshold CPER */
	socket_id = (adev->smuio.funcs && adev->smuio.funcs->get_socket_id) ?
			    adev->smuio.funcs->get_socket_id(adev) :
			    0;
	section->ctx.reg_dump[CPER_ACA_REG_CTL_LO] = 0x1;
	section->ctx.reg_dump[CPER_ACA_REG_CTL_HI] = 0x0;
	section->ctx.reg_dump[CPER_ACA_REG_STATUS_LO] = 0x137;
	section->ctx.reg_dump[CPER_ACA_REG_STATUS_HI] = 0xB0000000;
	section->ctx.reg_dump[CPER_ACA_REG_ADDR_LO] = 0x0;
	section->ctx.reg_dump[CPER_ACA_REG_ADDR_HI] = 0x0;
	section->ctx.reg_dump[CPER_ACA_REG_MISC0_LO] = 0x0;
	section->ctx.reg_dump[CPER_ACA_REG_MISC0_HI] = 0x0;
	section->ctx.reg_dump[CPER_ACA_REG_CONFIG_LO] = 0x2;
	section->ctx.reg_dump[CPER_ACA_REG_CONFIG_HI] = 0x1ff;
	section->ctx.reg_dump[CPER_ACA_REG_IPID_LO] = (socket_id / 4) & 0x01;
	section->ctx.reg_dump[CPER_ACA_REG_IPID_HI] = 0x096 | (((socket_id % 4) & 0x3) << 12);
	section->ctx.reg_dump[CPER_ACA_REG_SYND_LO] = 0x0;
	section->ctx.reg_dump[CPER_ACA_REG_SYND_HI] = 0x0;

	__inc_entry_length(hdr, SEC_DESC_LEN + NONSTD_SEC_LEN);

	return 0;
}

struct cper_hdr *amdgpu_cper_alloc_entry(struct amdgpu_device *adev,
					 enum amdgpu_cper_type type,
					 uint16_t section_count)
{
	struct cper_hdr *hdr;
	uint32_t size = 0;

	size += HDR_LEN;
	size += (SEC_DESC_LEN * section_count);

	switch (type) {
	case AMDGPU_CPER_TYPE_RUNTIME:
	case AMDGPU_CPER_TYPE_BP_THRESHOLD:
		size += (NONSTD_SEC_LEN * section_count);
		break;
	case AMDGPU_CPER_TYPE_FATAL:
		size += (FATAL_SEC_LEN * section_count);
		break;
	case AMDGPU_CPER_TYPE_BOOT:
		size += (BOOT_SEC_LEN * section_count);
		break;
	default:
		dev_err(adev->dev, "Unknown CPER Type!\n");
		return NULL;
	}

	hdr = kzalloc(size, GFP_KERNEL);
	if (!hdr)
		return NULL;

	/* Save the section count early; the section offsets depend on it */
	hdr->sec_cnt = section_count;

	return hdr;
}

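/*
 * Build a one-section fatal (uncorrectable error) CPER record from a single
 * ACA bank and queue it on the CPER ring buffer.
 */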
int amdgpu_cper_generate_ue_record(struct amdgpu_device *adev,
				   struct aca_bank *bank)
{
	struct cper_hdr *fatal = NULL;
	struct cper_sec_crashdump_reg_data reg_data = { 0 };
	struct amdgpu_ring *ring = &adev->cper.ring_buf;
	int ret;

	fatal = amdgpu_cper_alloc_entry(adev, AMDGPU_CPER_TYPE_FATAL, 1);
	if (!fatal) {
		dev_err(adev->dev, "failed to allocate cper entry for ue record\n");
		return -ENOMEM;
	}

	reg_data.status_lo = lower_32_bits(bank->regs[ACA_REG_IDX_STATUS]);
	reg_data.status_hi = upper_32_bits(bank->regs[ACA_REG_IDX_STATUS]);
	reg_data.addr_lo = lower_32_bits(bank->regs[ACA_REG_IDX_ADDR]);
	reg_data.addr_hi = upper_32_bits(bank->regs[ACA_REG_IDX_ADDR]);
	reg_data.ipid_lo = lower_32_bits(bank->regs[ACA_REG_IDX_IPID]);
	reg_data.ipid_hi = upper_32_bits(bank->regs[ACA_REG_IDX_IPID]);
	reg_data.synd_lo = lower_32_bits(bank->regs[ACA_REG_IDX_SYND]);
	reg_data.synd_hi = upper_32_bits(bank->regs[ACA_REG_IDX_SYND]);

	amdgpu_cper_entry_fill_hdr(adev, fatal, AMDGPU_CPER_TYPE_FATAL, CPER_SEV_FATAL);
	ret = amdgpu_cper_entry_fill_fatal_section(adev, fatal, 0, reg_data);
	if (ret) {
		kfree(fatal);
		return ret;
	}

	amdgpu_cper_ring_write(ring, fatal, fatal->record_length);
	kfree(fatal);

	return 0;
}

int amdgpu_cper_generate_bp_threshold_record(struct amdgpu_device *adev)
{
	struct cper_hdr *bp_threshold = NULL;
	struct amdgpu_ring *ring = &adev->cper.ring_buf;
	int ret;

	bp_threshold = amdgpu_cper_alloc_entry(adev, AMDGPU_CPER_TYPE_BP_THRESHOLD, 1);
	if (!bp_threshold) {
		dev_err(adev->dev, "failed to allocate cper entry for bad page threshold record\n");
		return -ENOMEM;
	}

	amdgpu_cper_entry_fill_hdr(adev, bp_threshold,
				   AMDGPU_CPER_TYPE_BP_THRESHOLD,
				   CPER_SEV_FATAL);
	ret = amdgpu_cper_entry_fill_bad_page_threshold_section(adev, bp_threshold, 0);
	if (ret) {
		kfree(bp_threshold);
		return ret;
	}

	amdgpu_cper_ring_write(ring, bp_threshold, bp_threshold->record_length);
	kfree(bp_threshold);

	return 0;
}

static enum cper_error_severity amdgpu_aca_err_type_to_cper_sev(struct amdgpu_device *adev,
								enum aca_error_type aca_err_type)
{
	switch (aca_err_type) {
	case ACA_ERROR_TYPE_UE:
		return CPER_SEV_FATAL;
	case ACA_ERROR_TYPE_CE:
		return CPER_SEV_NON_FATAL_CORRECTED;
	case ACA_ERROR_TYPE_DEFERRED:
		return CPER_SEV_NON_FATAL_UNCORRECTED;
	default:
		dev_err(adev->dev, "Unknown ACA error type!\n");
		return CPER_SEV_FATAL;
	}
}

int amdgpu_cper_generate_ce_records(struct amdgpu_device *adev,
				    struct aca_banks *banks,
				    uint16_t bank_count)
{
	struct cper_hdr *corrected = NULL;
	enum cper_error_severity sev = CPER_SEV_NON_FATAL_CORRECTED;
	struct amdgpu_ring *ring = &adev->cper.ring_buf;
	uint32_t reg_data[CPER_ACA_REG_COUNT] = { 0 };
	struct aca_bank_node *node;
	struct aca_bank *bank;
	uint32_t i = 0;
	int ret;

	corrected = amdgpu_cper_alloc_entry(adev, AMDGPU_CPER_TYPE_RUNTIME, bank_count);
	if (!corrected) {
		dev_err(adev->dev, "failed to allocate cper entry for ce records\n");
		return -ENOMEM;
	}

	/* Raise severity if any DE is detected in the ACA bank list */
	list_for_each_entry(node, &banks->list, node) {
		bank = &node->bank;
		if (bank->aca_err_type == ACA_ERROR_TYPE_DEFERRED) {
			sev = CPER_SEV_NON_FATAL_UNCORRECTED;
			break;
		}
	}

	amdgpu_cper_entry_fill_hdr(adev, corrected, AMDGPU_CPER_TYPE_RUNTIME, sev);

	/* Combine CE and DE in cper record */
	list_for_each_entry(node, &banks->list, node) {
		bank = &node->bank;
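		/* Snapshot the ACA bank registers into the CPER register dump layout */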
		reg_data[CPER_ACA_REG_CTL_LO] = lower_32_bits(bank->regs[ACA_REG_IDX_CTL]);
		reg_data[CPER_ACA_REG_CTL_HI] = upper_32_bits(bank->regs[ACA_REG_IDX_CTL]);
		reg_data[CPER_ACA_REG_STATUS_LO] = lower_32_bits(bank->regs[ACA_REG_IDX_STATUS]);
		reg_data[CPER_ACA_REG_STATUS_HI] = upper_32_bits(bank->regs[ACA_REG_IDX_STATUS]);
		reg_data[CPER_ACA_REG_ADDR_LO] = lower_32_bits(bank->regs[ACA_REG_IDX_ADDR]);
		reg_data[CPER_ACA_REG_ADDR_HI] = upper_32_bits(bank->regs[ACA_REG_IDX_ADDR]);
		reg_data[CPER_ACA_REG_MISC0_LO] = lower_32_bits(bank->regs[ACA_REG_IDX_MISC0]);
		reg_data[CPER_ACA_REG_MISC0_HI] = upper_32_bits(bank->regs[ACA_REG_IDX_MISC0]);
		reg_data[CPER_ACA_REG_CONFIG_LO] = lower_32_bits(bank->regs[ACA_REG_IDX_CONFIG]);
		reg_data[CPER_ACA_REG_CONFIG_HI] = upper_32_bits(bank->regs[ACA_REG_IDX_CONFIG]);
		reg_data[CPER_ACA_REG_IPID_LO] = lower_32_bits(bank->regs[ACA_REG_IDX_IPID]);
		reg_data[CPER_ACA_REG_IPID_HI] = upper_32_bits(bank->regs[ACA_REG_IDX_IPID]);
		reg_data[CPER_ACA_REG_SYND_LO] = lower_32_bits(bank->regs[ACA_REG_IDX_SYND]);
		reg_data[CPER_ACA_REG_SYND_HI] = upper_32_bits(bank->regs[ACA_REG_IDX_SYND]);

		ret = amdgpu_cper_entry_fill_runtime_section(adev, corrected, i++,
							     amdgpu_aca_err_type_to_cper_sev(adev, bank->aca_err_type),
							     reg_data, CPER_ACA_REG_COUNT);
		if (ret) {
			kfree(corrected);
			return ret;
		}
	}

	amdgpu_cper_ring_write(ring, corrected, corrected->record_length);
	kfree(corrected);

	return 0;
}

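/*
 * CPER ring helpers: records are stored back to back in the ring buffer, so
 * record boundaries are found by scanning for the "CPER" signature.
 * amdgpu_cper_ring_write() relies on this to advance rptr past whole records
 * whenever the writer overtakes the reader.
 */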
static bool amdgpu_cper_is_hdr(struct amdgpu_ring *ring, u64 pos)
{
	struct cper_hdr *chdr;

	chdr = (struct cper_hdr *)&(ring->ring[pos]);
	return strcmp(chdr->signature, "CPER") ? false : true;
}

static u32 amdgpu_cper_ring_get_ent_sz(struct amdgpu_ring *ring, u64 pos)
{
	struct cper_hdr *chdr;
	u64 p;
	u32 chunk, rec_len = 0;

	chdr = (struct cper_hdr *)&(ring->ring[pos]);
	chunk = ring->ring_size - (pos << 2);

	if (!strcmp(chdr->signature, "CPER")) {
		rec_len = chdr->record_length;
		goto calc;
	}

	/* ring buffer is not full, no cper data after ring->wptr */
	if (ring->count_dw)
		goto calc;

	for (p = pos + 1; p <= ring->buf_mask; p++) {
		chdr = (struct cper_hdr *)&(ring->ring[p]);
		if (!strcmp(chdr->signature, "CPER")) {
			rec_len = (p - pos) << 2;
			goto calc;
		}
	}

calc:
	if (!rec_len)
		return chunk;
	else
		return umin(rec_len, chunk);
}

void amdgpu_cper_ring_write(struct amdgpu_ring *ring, void *src, int count)
{
	u64 pos, wptr_old, rptr;
	int rec_cnt_dw = count >> 2;
	u32 chunk, ent_sz;
	u8 *s = (u8 *)src;

	if (count >= ring->ring_size - 4) {
		dev_err(ring->adev->dev,
			"CPER data size(%d) is larger than ring size(%d)\n",
			count, ring->ring_size - 4);

		return;
	}

	mutex_lock(&ring->adev->cper.ring_lock);

	wptr_old = ring->wptr;
	rptr = *ring->rptr_cpu_addr & ring->ptr_mask;

	while (count) {
		ent_sz = amdgpu_cper_ring_get_ent_sz(ring, ring->wptr);
		chunk = umin(ent_sz, count);

		memcpy(&ring->ring[ring->wptr], s, chunk);

		ring->wptr += (chunk >> 2);
		ring->wptr &= ring->ptr_mask;
		count -= chunk;
		s += chunk;
	}

	if (ring->count_dw < rec_cnt_dw)
		ring->count_dw = 0;

	/* the buffer has overflowed, adjust rptr to the next record header */
	if (((wptr_old < rptr) && (rptr <= ring->wptr)) ||
	    ((ring->wptr < wptr_old) && (wptr_old < rptr)) ||
	    ((rptr <= ring->wptr) && (ring->wptr < wptr_old))) {
		pos = (ring->wptr + 1) & ring->ptr_mask;

		do {
			ent_sz = amdgpu_cper_ring_get_ent_sz(ring, pos);

			rptr += (ent_sz >> 2);
			rptr &= ring->ptr_mask;
			*ring->rptr_cpu_addr = rptr;

			pos = rptr;
		} while (!amdgpu_cper_is_hdr(ring, rptr));
	}

	if (ring->count_dw >= rec_cnt_dw)
		ring->count_dw -= rec_cnt_dw;
	mutex_unlock(&ring->adev->cper.ring_lock);
}

static u64 amdgpu_cper_ring_get_rptr(struct amdgpu_ring *ring)
{
	return *(ring->rptr_cpu_addr);
}

static u64 amdgpu_cper_ring_get_wptr(struct amdgpu_ring *ring)
{
	return ring->wptr;
}

static const struct amdgpu_ring_funcs cper_ring_funcs = {
	.type = AMDGPU_RING_TYPE_CPER,
	.align_mask = 0xff,
	.support_64bit_ptrs = false,
	.get_rptr = amdgpu_cper_ring_get_rptr,
	.get_wptr = amdgpu_cper_ring_get_wptr,
};

static int amdgpu_cper_ring_init(struct amdgpu_device *adev)
{
	struct amdgpu_ring *ring = &(adev->cper.ring_buf);

	mutex_init(&adev->cper.ring_lock);

	ring->adev = NULL;
	ring->ring_obj = NULL;
	ring->use_doorbell = false;
	ring->no_scheduler = true;
	ring->funcs = &cper_ring_funcs;

	sprintf(ring->name, "cper");
	return amdgpu_ring_init(adev, ring, CPER_MAX_RING_SIZE, NULL, 0,
				AMDGPU_RING_PRIO_DEFAULT, NULL);
}

int amdgpu_cper_init(struct amdgpu_device *adev)
{
	int r;

	if (amdgpu_sriov_vf(adev) && !amdgpu_sriov_ras_cper_en(adev))
		return 0;
	else if (!amdgpu_sriov_vf(adev) && !amdgpu_uniras_enabled(adev) &&
		 !amdgpu_aca_is_enabled(adev))
		return 0;

	r = amdgpu_cper_ring_init(adev);
	if (r) {
		dev_err(adev->dev, "failed to initialize cper ring, r = %d\n", r);
		return r;
	}

	mutex_init(&adev->cper.cper_lock);

	adev->cper.enabled = true;
	adev->cper.max_count = CPER_MAX_ALLOWED_COUNT;

	return 0;
}

int amdgpu_cper_fini(struct amdgpu_device *adev)
{
	if (!amdgpu_aca_is_enabled(adev) && !amdgpu_sriov_ras_cper_en(adev))
		return 0;

	adev->cper.enabled = false;

	amdgpu_ring_fini(&(adev->cper.ring_buf));
	adev->cper.count = 0;
	adev->cper.wptr = 0;

	return 0;
}