1 // SPDX-License-Identifier: MIT 2 /* 3 * Copyright 2025 Advanced Micro Devices, Inc. 4 * 5 * Permission is hereby granted, free of charge, to any person obtaining a 6 * copy of this software and associated documentation files (the "Software"), 7 * to deal in the Software without restriction, including without limitation 8 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 9 * and/or sell copies of the Software, and to permit persons to whom the 10 * Software is furnished to do so, subject to the following conditions: 11 * 12 * The above copyright notice and this permission notice shall be included in 13 * all copies or substantial portions of the Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 19 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 20 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 21 * OTHER DEALINGS IN THE SOFTWARE. 22 * 23 */ 24 #include "ras.h" 25 #include "ras_aca.h" 26 #include "ras_aca_v1_0.h" 27 #include "ras_mp1_v13_0.h" 28 29 #define ACA_MARK_FATAL_FLAG 0x100 30 #define ACA_MARK_UE_READ_FLAG 0x1 31 32 #define blk_name(block_id) ras_core_get_ras_block_name(block_id) 33 34 static struct aca_regs_dump { 35 const char *name; 36 int reg_idx; 37 } aca_regs[] = { 38 {"CONTROL", ACA_REG_IDX__CTL}, 39 {"STATUS", ACA_REG_IDX__STATUS}, 40 {"ADDR", ACA_REG_IDX__ADDR}, 41 {"MISC", ACA_REG_IDX__MISC0}, 42 {"CONFIG", ACA_REG_IDX__CONFG}, 43 {"IPID", ACA_REG_IDX__IPID}, 44 {"SYND", ACA_REG_IDX__SYND}, 45 {"DESTAT", ACA_REG_IDX__DESTAT}, 46 {"DEADDR", ACA_REG_IDX__DEADDR}, 47 {"CONTROL_MASK", ACA_REG_IDX__CTL_MASK}, 48 }; 49 50 51 static void aca_report_ecc_info(struct ras_core_context *ras_core, 52 u64 seq_no, u32 blk, u32 skt, u32 aid, 53 struct aca_aid_ecc *aid_ecc, 54 struct aca_bank_ecc *new_ecc) 55 { 56 struct aca_ecc_count ecc_count = {0}; 57 58 ecc_count.new_ue_count = new_ecc->ue_count; 59 ecc_count.new_de_count = new_ecc->de_count; 60 ecc_count.new_ce_count = new_ecc->ce_count; 61 if (blk == RAS_BLOCK_ID__GFX) { 62 struct aca_ecc_count *xcd_ecc; 63 int xcd_id; 64 65 for (xcd_id = 0; xcd_id < aid_ecc->xcd.xcd_num; xcd_id++) { 66 xcd_ecc = &aid_ecc->xcd.xcd[xcd_id].ecc_err; 67 ecc_count.total_ue_count += xcd_ecc->total_ue_count; 68 ecc_count.total_de_count += xcd_ecc->total_de_count; 69 ecc_count.total_ce_count += xcd_ecc->total_ce_count; 70 } 71 } else { 72 ecc_count.total_ue_count = aid_ecc->ecc_err.total_ue_count; 73 ecc_count.total_de_count = aid_ecc->ecc_err.total_de_count; 74 ecc_count.total_ce_count = aid_ecc->ecc_err.total_ce_count; 75 } 76 77 if (ecc_count.new_ue_count) { 78 RAS_DEV_INFO(ras_core->dev, 79 "{%llu} socket: %d, die: %d, %u new uncorrectable hardware errors detected in %s block\n", 80 seq_no, skt, aid, ecc_count.new_ue_count, blk_name(blk)); 81 RAS_DEV_INFO(ras_core->dev, 82 "{%llu} socket: %d, die: %d, %u uncorrectable hardware errors detected in total in %s block\n", 83 seq_no, skt, aid, ecc_count.total_ue_count, blk_name(blk)); 84 } 85 86 if (ecc_count.new_de_count) { 87 RAS_DEV_INFO(ras_core->dev, 88 "{%llu} socket: %d, die: %d, %u new %s detected in %s block\n", 89 seq_no, skt, aid, ecc_count.new_de_count, 90 (blk == RAS_BLOCK_ID__UMC) ? 91 "deferred hardware errors" : "poison consumption", 92 blk_name(blk)); 93 RAS_DEV_INFO(ras_core->dev, 94 "{%llu} socket: %d, die: %d, %u %s detected in total in %s block\n", 95 seq_no, skt, aid, ecc_count.total_de_count, 96 (blk == RAS_BLOCK_ID__UMC) ? 97 "deferred hardware errors" : "poison consumption", 98 blk_name(blk)); 99 } 100 101 if (ecc_count.new_ce_count) { 102 RAS_DEV_INFO(ras_core->dev, 103 "{%llu} socket: %d, die: %d, %u new correctable hardware errors detected in %s block\n", 104 seq_no, skt, aid, ecc_count.new_ce_count, blk_name(blk)); 105 RAS_DEV_INFO(ras_core->dev, 106 "{%llu} socket: %d, die: %d, %u correctable hardware errors detected in total in %s block\n", 107 seq_no, skt, aid, ecc_count.total_ce_count, blk_name(blk)); 108 } 109 } 110 111 static void aca_bank_log(struct ras_core_context *ras_core, 112 int idx, int total, struct aca_bank_reg *bank, 113 struct aca_bank_ecc *bank_ecc) 114 { 115 int i; 116 117 RAS_DEV_INFO(ras_core->dev, 118 "{%llu}" RAS_HW_ERR "Accelerator Check Architecture events logged\n", 119 bank->seq_no); 120 /* plus 1 for output format, e.g: ACA[08/08]: xxxx */ 121 for (i = 0; i < ARRAY_SIZE(aca_regs); i++) 122 RAS_DEV_INFO(ras_core->dev, 123 "{%llu}" RAS_HW_ERR "ACA[%02d/%02d].%s=0x%016llx\n", 124 bank->seq_no, idx + 1, total, 125 aca_regs[i].name, bank->regs[aca_regs[i].reg_idx]); 126 } 127 128 static void aca_log_bank_data(struct ras_core_context *ras_core, 129 struct aca_bank_reg *bank, struct aca_bank_ecc *bank_ecc, 130 struct ras_log_batch_tag *batch) 131 { 132 if (bank_ecc->ue_count) 133 ras_log_ring_add_log_event(ras_core, RAS_LOG_EVENT_UE, bank->regs, batch); 134 else if (bank_ecc->de_count) 135 ras_log_ring_add_log_event(ras_core, RAS_LOG_EVENT_DE, bank->regs, batch); 136 else 137 ras_log_ring_add_log_event(ras_core, RAS_LOG_EVENT_CE, bank->regs, batch); 138 } 139 140 static int aca_get_bank_count(struct ras_core_context *ras_core, 141 enum ras_err_type type, u32 *count) 142 { 143 return ras_mp1_get_bank_count(ras_core, type, count); 144 } 145 146 static bool aca_match_bank(struct aca_block *aca_blk, struct aca_bank_reg *bank) 147 { 148 const struct aca_bank_hw_ops *bank_ops; 149 150 if (!aca_blk->blk_info) 151 return false; 152 153 bank_ops = &aca_blk->blk_info->bank_ops; 154 if (!bank_ops->bank_match) 155 return false; 156 157 return bank_ops->bank_match(aca_blk, bank); 158 } 159 160 static int aca_parse_bank(struct ras_core_context *ras_core, 161 struct aca_block *aca_blk, 162 struct aca_bank_reg *bank, 163 struct aca_bank_ecc *ecc) 164 { 165 const struct aca_bank_hw_ops *bank_ops = &aca_blk->blk_info->bank_ops; 166 167 if (!bank_ops || !bank_ops->bank_parse) 168 return -RAS_CORE_NOT_SUPPORTED; 169 170 return bank_ops->bank_parse(ras_core, aca_blk, bank, ecc); 171 } 172 173 static int aca_check_block_ecc_info(struct ras_core_context *ras_core, 174 struct aca_block *aca_blk, struct aca_ecc_info *info) 175 { 176 if (info->socket_id >= aca_blk->ecc.socket_num_per_hive) { 177 RAS_DEV_ERR(ras_core->dev, 178 "Socket id (%d) is out of config! max:%u\n", 179 info->socket_id, aca_blk->ecc.socket_num_per_hive); 180 return -ENODATA; 181 } 182 183 if (info->die_id >= aca_blk->ecc.socket[info->socket_id].aid_num) { 184 RAS_DEV_ERR(ras_core->dev, 185 "Die id (%d) is out of config! max:%u\n", 186 info->die_id, aca_blk->ecc.socket[info->socket_id].aid_num); 187 return -ENODATA; 188 } 189 190 if ((aca_blk->blk_info->ras_block_id == RAS_BLOCK_ID__GFX) && 191 (info->xcd_id >= 192 aca_blk->ecc.socket[info->socket_id].aid[info->die_id].xcd.xcd_num)) { 193 RAS_DEV_ERR(ras_core->dev, 194 "Xcd id (%d) is out of config! max:%u\n", 195 info->xcd_id, 196 aca_blk->ecc.socket[info->socket_id].aid[info->die_id].xcd.xcd_num); 197 return -ENODATA; 198 } 199 200 return 0; 201 } 202 203 static int aca_log_bad_bank(struct ras_core_context *ras_core, 204 struct aca_block *aca_blk, struct aca_bank_reg *bank, 205 struct aca_bank_ecc *bank_ecc) 206 { 207 struct aca_ecc_info *info; 208 struct aca_ecc_count *ecc_err; 209 struct aca_aid_ecc *aid_ecc; 210 int ret; 211 212 info = &bank_ecc->bank_info; 213 214 ret = aca_check_block_ecc_info(ras_core, aca_blk, info); 215 if (ret) 216 return ret; 217 218 mutex_lock(&ras_core->ras_aca.aca_lock); 219 aid_ecc = &aca_blk->ecc.socket[info->socket_id].aid[info->die_id]; 220 if (aca_blk->blk_info->ras_block_id == RAS_BLOCK_ID__GFX) 221 ecc_err = &aid_ecc->xcd.xcd[info->xcd_id].ecc_err; 222 else 223 ecc_err = &aid_ecc->ecc_err; 224 225 ecc_err->new_ce_count += bank_ecc->ce_count; 226 ecc_err->total_ce_count += bank_ecc->ce_count; 227 ecc_err->new_ue_count += bank_ecc->ue_count; 228 ecc_err->total_ue_count += bank_ecc->ue_count; 229 ecc_err->new_de_count += bank_ecc->de_count; 230 ecc_err->total_de_count += bank_ecc->de_count; 231 mutex_unlock(&ras_core->ras_aca.aca_lock); 232 233 if ((aca_blk->blk_info->ras_block_id == RAS_BLOCK_ID__UMC) && 234 bank_ecc->de_count) { 235 struct ras_bank_ecc ras_ecc = {0}; 236 237 ras_ecc.nps = ras_core_get_curr_nps_mode(ras_core); 238 ras_ecc.addr = bank_ecc->bank_info.addr; 239 ras_ecc.ipid = bank_ecc->bank_info.ipid; 240 ras_ecc.status = bank_ecc->bank_info.status; 241 ras_ecc.seq_no = bank->seq_no; 242 243 if (ras_core_gpu_in_reset(ras_core)) 244 ras_umc_log_bad_bank_pending(ras_core, &ras_ecc); 245 else 246 ras_umc_log_bad_bank(ras_core, &ras_ecc); 247 } 248 249 aca_report_ecc_info(ras_core, 250 bank->seq_no, aca_blk->blk_info->ras_block_id, info->socket_id, info->die_id, 251 &aca_blk->ecc.socket[info->socket_id].aid[info->die_id], bank_ecc); 252 253 return 0; 254 } 255 256 static struct aca_block *aca_get_bank_aca_block(struct ras_core_context *ras_core, 257 struct aca_bank_reg *bank) 258 { 259 int i = 0; 260 261 for (i = 0; i < RAS_BLOCK_ID__LAST; i++) 262 if (aca_match_bank(&ras_core->ras_aca.aca_blk[i], bank)) 263 return &ras_core->ras_aca.aca_blk[i]; 264 265 return NULL; 266 } 267 268 static int aca_dump_bank(struct ras_core_context *ras_core, u32 ecc_type, 269 int idx, void *data) 270 { 271 struct aca_bank_reg *bank = (struct aca_bank_reg *)data; 272 int i, ret, reg_cnt; 273 274 reg_cnt = min_t(int, 16, ARRAY_SIZE(bank->regs)); 275 for (i = 0; i < reg_cnt; i++) { 276 ret = ras_mp1_dump_bank(ras_core, ecc_type, idx, i, &bank->regs[i]); 277 if (ret) 278 return ret; 279 } 280 281 return 0; 282 } 283 284 static uint64_t aca_get_bank_seqno(struct ras_core_context *ras_core, 285 enum ras_err_type err_type, struct aca_block *aca_blk, 286 struct aca_bank_ecc *bank_ecc) 287 { 288 uint64_t seq_no = 0; 289 290 if (bank_ecc->de_count) { 291 if (aca_blk->blk_info->ras_block_id == RAS_BLOCK_ID__UMC) 292 seq_no = ras_core_get_seqno(ras_core, RAS_SEQNO_TYPE_DE, true); 293 else 294 seq_no = ras_core_get_seqno(ras_core, 295 RAS_SEQNO_TYPE_POISON_CONSUMPTION, true); 296 } else if (bank_ecc->ue_count) { 297 seq_no = ras_core_get_seqno(ras_core, RAS_SEQNO_TYPE_UE, true); 298 } else { 299 seq_no = ras_core_get_seqno(ras_core, RAS_SEQNO_TYPE_CE, true); 300 } 301 302 return seq_no; 303 } 304 305 static bool aca_dup_update_ue_in_fatal(struct ras_core_context *ras_core, 306 u32 ecc_type) 307 { 308 struct ras_aca *aca = &ras_core->ras_aca; 309 310 if (ecc_type != RAS_ERR_TYPE__UE) 311 return false; 312 313 if (aca->ue_updated_mark & ACA_MARK_FATAL_FLAG) { 314 if (aca->ue_updated_mark & ACA_MARK_UE_READ_FLAG) 315 return true; 316 317 aca->ue_updated_mark |= ACA_MARK_UE_READ_FLAG; 318 } 319 320 return false; 321 } 322 323 void ras_aca_mark_fatal_flag(struct ras_core_context *ras_core) 324 { 325 struct ras_aca *aca = &ras_core->ras_aca; 326 327 if (!aca) 328 return; 329 330 aca->ue_updated_mark |= ACA_MARK_FATAL_FLAG; 331 } 332 333 void ras_aca_clear_fatal_flag(struct ras_core_context *ras_core) 334 { 335 struct ras_aca *aca = &ras_core->ras_aca; 336 337 if (!aca) 338 return; 339 340 if ((aca->ue_updated_mark & ACA_MARK_FATAL_FLAG) && 341 (aca->ue_updated_mark & ACA_MARK_UE_READ_FLAG)) 342 aca->ue_updated_mark = 0; 343 } 344 345 static int aca_banks_update(struct ras_core_context *ras_core, 346 u32 ecc_type, void *data) 347 { 348 struct aca_bank_reg bank; 349 struct aca_block *aca_blk; 350 struct aca_bank_ecc bank_ecc; 351 struct ras_log_batch_tag *batch_tag = NULL; 352 u32 count = 0; 353 int ret = 0; 354 int i; 355 356 mutex_lock(&ras_core->ras_aca.bank_op_lock); 357 358 if (aca_dup_update_ue_in_fatal(ras_core, ecc_type)) 359 goto out; 360 361 ret = aca_get_bank_count(ras_core, ecc_type, &count); 362 if (ret) 363 goto out; 364 365 if (!count) 366 goto out; 367 368 batch_tag = ras_log_ring_create_batch_tag(ras_core); 369 for (i = 0; i < count; i++) { 370 memset(&bank, 0, sizeof(bank)); 371 ret = aca_dump_bank(ras_core, ecc_type, i, &bank); 372 if (ret) 373 break; 374 375 bank.ecc_type = ecc_type; 376 377 memset(&bank_ecc, 0, sizeof(bank_ecc)); 378 aca_blk = aca_get_bank_aca_block(ras_core, &bank); 379 if (aca_blk) 380 ret = aca_parse_bank(ras_core, aca_blk, &bank, &bank_ecc); 381 382 bank.seq_no = aca_get_bank_seqno(ras_core, ecc_type, aca_blk, &bank_ecc); 383 384 aca_log_bank_data(ras_core, &bank, &bank_ecc, batch_tag); 385 aca_bank_log(ras_core, i, count, &bank, &bank_ecc); 386 387 if (!ret && aca_blk) 388 ret = aca_log_bad_bank(ras_core, aca_blk, &bank, &bank_ecc); 389 390 if (ret) 391 break; 392 } 393 ras_log_ring_destroy_batch_tag(ras_core, batch_tag); 394 395 out: 396 mutex_unlock(&ras_core->ras_aca.bank_op_lock); 397 return ret; 398 } 399 400 int ras_aca_update_ecc(struct ras_core_context *ras_core, u32 type, void *data) 401 { 402 /* Update aca bank to aca source error_cache first */ 403 return aca_banks_update(ras_core, type, data); 404 } 405 406 static struct aca_block *ras_aca_get_block_handle(struct ras_core_context *ras_core, uint32_t blk) 407 { 408 return &ras_core->ras_aca.aca_blk[blk]; 409 } 410 411 static int ras_aca_clear_block_ecc_count(struct ras_core_context *ras_core, u32 blk) 412 { 413 struct aca_block *aca_blk; 414 struct aca_aid_ecc *aid_ecc; 415 int skt, aid, xcd; 416 417 mutex_lock(&ras_core->ras_aca.aca_lock); 418 aca_blk = ras_aca_get_block_handle(ras_core, blk); 419 for (skt = 0; skt < aca_blk->ecc.socket_num_per_hive; skt++) { 420 for (aid = 0; aid < aca_blk->ecc.socket[skt].aid_num; aid++) { 421 aid_ecc = &aca_blk->ecc.socket[skt].aid[aid]; 422 if (blk == RAS_BLOCK_ID__GFX) { 423 for (xcd = 0; xcd < aid_ecc->xcd.xcd_num; xcd++) 424 memset(&aid_ecc->xcd.xcd[xcd], 425 0, sizeof(struct aca_xcd_ecc)); 426 } else { 427 memset(&aid_ecc->ecc_err, 0, sizeof(aid_ecc->ecc_err)); 428 } 429 } 430 } 431 mutex_unlock(&ras_core->ras_aca.aca_lock); 432 433 return 0; 434 } 435 436 int ras_aca_clear_all_blocks_ecc_count(struct ras_core_context *ras_core) 437 { 438 enum ras_block_id blk; 439 int ret; 440 441 for (blk = RAS_BLOCK_ID__UMC; blk < RAS_BLOCK_ID__LAST; blk++) { 442 ret = ras_aca_clear_block_ecc_count(ras_core, blk); 443 if (ret) 444 break; 445 } 446 447 return ret; 448 } 449 450 int ras_aca_clear_block_new_ecc_count(struct ras_core_context *ras_core, u32 blk) 451 { 452 struct aca_block *aca_blk; 453 int skt, aid, xcd; 454 struct aca_ecc_count *ecc_err; 455 struct aca_aid_ecc *aid_ecc; 456 457 mutex_lock(&ras_core->ras_aca.aca_lock); 458 aca_blk = ras_aca_get_block_handle(ras_core, blk); 459 for (skt = 0; skt < aca_blk->ecc.socket_num_per_hive; skt++) { 460 for (aid = 0; aid < aca_blk->ecc.socket[skt].aid_num; aid++) { 461 aid_ecc = &aca_blk->ecc.socket[skt].aid[aid]; 462 if (blk == RAS_BLOCK_ID__GFX) { 463 for (xcd = 0; xcd < aid_ecc->xcd.xcd_num; xcd++) { 464 ecc_err = &aid_ecc->xcd.xcd[xcd].ecc_err; 465 ecc_err->new_ce_count = 0; 466 ecc_err->new_ue_count = 0; 467 ecc_err->new_de_count = 0; 468 } 469 } else { 470 ecc_err = &aid_ecc->ecc_err; 471 ecc_err->new_ce_count = 0; 472 ecc_err->new_ue_count = 0; 473 ecc_err->new_de_count = 0; 474 } 475 } 476 } 477 mutex_unlock(&ras_core->ras_aca.aca_lock); 478 479 return 0; 480 } 481 482 static int ras_aca_get_block_each_aid_ecc_count(struct ras_core_context *ras_core, 483 u32 blk, u32 skt, u32 aid, u32 xcd, 484 struct aca_ecc_count *ecc_count) 485 { 486 struct aca_block *aca_blk; 487 struct aca_ecc_count *ecc_err; 488 489 aca_blk = ras_aca_get_block_handle(ras_core, blk); 490 if (blk == RAS_BLOCK_ID__GFX) 491 ecc_err = &aca_blk->ecc.socket[skt].aid[aid].xcd.xcd[xcd].ecc_err; 492 else 493 ecc_err = &aca_blk->ecc.socket[skt].aid[aid].ecc_err; 494 495 ecc_count->new_ce_count = ecc_err->new_ce_count; 496 ecc_count->total_ce_count = ecc_err->total_ce_count; 497 ecc_count->new_ue_count = ecc_err->new_ue_count; 498 ecc_count->total_ue_count = ecc_err->total_ue_count; 499 ecc_count->new_de_count = ecc_err->new_de_count; 500 ecc_count->total_de_count = ecc_err->total_de_count; 501 502 return 0; 503 } 504 505 static inline void _add_ecc_count(struct aca_ecc_count *des, struct aca_ecc_count *src) 506 { 507 des->new_ce_count += src->new_ce_count; 508 des->total_ce_count += src->total_ce_count; 509 des->new_ue_count += src->new_ue_count; 510 des->total_ue_count += src->total_ue_count; 511 des->new_de_count += src->new_de_count; 512 des->total_de_count += src->total_de_count; 513 } 514 515 static const struct ras_aca_ip_func *aca_get_ip_func( 516 struct ras_core_context *ras_core, uint32_t ip_version) 517 { 518 switch (ip_version) { 519 case IP_VERSION(1, 0, 0): 520 return &ras_aca_func_v1_0; 521 default: 522 RAS_DEV_ERR(ras_core->dev, 523 "ACA ip version(0x%x) is not supported!\n", ip_version); 524 break; 525 } 526 527 return NULL; 528 } 529 530 int ras_aca_get_block_ecc_count(struct ras_core_context *ras_core, 531 u32 blk, void *data) 532 { 533 struct ras_ecc_count *err_data = (struct ras_ecc_count *)data; 534 struct aca_block *aca_blk; 535 int skt, aid, xcd; 536 struct aca_ecc_count ecc_xcd; 537 struct aca_ecc_count ecc_aid; 538 struct aca_ecc_count ecc; 539 540 if (blk >= RAS_BLOCK_ID__LAST) 541 return -EINVAL; 542 543 if (!err_data) 544 return -EINVAL; 545 546 aca_blk = ras_aca_get_block_handle(ras_core, blk); 547 memset(&ecc, 0, sizeof(ecc)); 548 549 mutex_lock(&ras_core->ras_aca.aca_lock); 550 if (blk == RAS_BLOCK_ID__GFX) { 551 for (skt = 0; skt < aca_blk->ecc.socket_num_per_hive; skt++) { 552 for (aid = 0; aid < aca_blk->ecc.socket[skt].aid_num; aid++) { 553 memset(&ecc_aid, 0, sizeof(ecc_aid)); 554 for (xcd = 0; 555 xcd < aca_blk->ecc.socket[skt].aid[aid].xcd.xcd_num; 556 xcd++) { 557 memset(&ecc_xcd, 0, sizeof(ecc_xcd)); 558 if (ras_aca_get_block_each_aid_ecc_count(ras_core, 559 blk, skt, aid, xcd, &ecc_xcd)) 560 continue; 561 _add_ecc_count(&ecc_aid, &ecc_xcd); 562 } 563 _add_ecc_count(&ecc, &ecc_aid); 564 } 565 } 566 } else { 567 for (skt = 0; skt < aca_blk->ecc.socket_num_per_hive; skt++) { 568 for (aid = 0; aid < aca_blk->ecc.socket[skt].aid_num; aid++) { 569 memset(&ecc_aid, 0, sizeof(ecc_aid)); 570 if (ras_aca_get_block_each_aid_ecc_count(ras_core, 571 blk, skt, aid, 0, &ecc_aid)) 572 continue; 573 _add_ecc_count(&ecc, &ecc_aid); 574 } 575 } 576 } 577 578 err_data->new_ce_count = ecc.new_ce_count; 579 err_data->total_ce_count = ecc.total_ce_count; 580 err_data->new_ue_count = ecc.new_ue_count; 581 err_data->total_ue_count = ecc.total_ue_count; 582 err_data->new_de_count = ecc.new_de_count; 583 err_data->total_de_count = ecc.total_de_count; 584 mutex_unlock(&ras_core->ras_aca.aca_lock); 585 586 return 0; 587 } 588 589 int ras_aca_sw_init(struct ras_core_context *ras_core) 590 { 591 struct ras_aca *ras_aca = &ras_core->ras_aca; 592 struct ras_aca_config *aca_cfg = &ras_core->config->aca_cfg; 593 struct aca_block *aca_blk; 594 uint32_t socket_num_per_hive; 595 uint32_t aid_num_per_socket; 596 uint32_t xcd_num_per_aid; 597 int blk, skt, aid; 598 599 socket_num_per_hive = aca_cfg->socket_num_per_hive; 600 aid_num_per_socket = aca_cfg->aid_num_per_socket; 601 xcd_num_per_aid = aca_cfg->xcd_num_per_aid; 602 603 if (!xcd_num_per_aid || !aid_num_per_socket || 604 (socket_num_per_hive > MAX_SOCKET_NUM_PER_HIVE) || 605 (aid_num_per_socket > MAX_AID_NUM_PER_SOCKET) || 606 (xcd_num_per_aid > MAX_XCD_NUM_PER_AID)) { 607 RAS_DEV_ERR(ras_core->dev, "Invalid ACA system configuration: %d, %d, %d\n", 608 socket_num_per_hive, aid_num_per_socket, xcd_num_per_aid); 609 return -EINVAL; 610 } 611 612 memset(ras_aca, 0, sizeof(*ras_aca)); 613 614 for (blk = 0; blk < RAS_BLOCK_ID__LAST; blk++) { 615 aca_blk = &ras_aca->aca_blk[blk]; 616 aca_blk->ecc.socket_num_per_hive = socket_num_per_hive; 617 for (skt = 0; skt < aca_blk->ecc.socket_num_per_hive; skt++) { 618 aca_blk->ecc.socket[skt].aid_num = aid_num_per_socket; 619 if (blk == RAS_BLOCK_ID__GFX) { 620 for (aid = 0; aid < aca_blk->ecc.socket[skt].aid_num; aid++) 621 aca_blk->ecc.socket[skt].aid[aid].xcd.xcd_num = 622 xcd_num_per_aid; 623 } 624 } 625 } 626 627 mutex_init(&ras_aca->aca_lock); 628 mutex_init(&ras_aca->bank_op_lock); 629 630 return 0; 631 } 632 633 int ras_aca_sw_fini(struct ras_core_context *ras_core) 634 { 635 struct ras_aca *ras_aca = &ras_core->ras_aca; 636 637 mutex_destroy(&ras_aca->aca_lock); 638 mutex_destroy(&ras_aca->bank_op_lock); 639 640 return 0; 641 } 642 643 int ras_aca_hw_init(struct ras_core_context *ras_core) 644 { 645 struct ras_aca *ras_aca = &ras_core->ras_aca; 646 struct aca_block *aca_blk; 647 const struct ras_aca_ip_func *ip_func; 648 int i; 649 650 ras_aca->aca_ip_version = ras_core->config->aca_ip_version; 651 ip_func = aca_get_ip_func(ras_core, ras_aca->aca_ip_version); 652 if (!ip_func) 653 return -EINVAL; 654 655 for (i = 0; i < ip_func->block_num; i++) { 656 aca_blk = &ras_aca->aca_blk[ip_func->block_info[i]->ras_block_id]; 657 aca_blk->blk_info = ip_func->block_info[i]; 658 } 659 660 ras_aca->ue_updated_mark = 0; 661 662 return 0; 663 } 664 665 int ras_aca_hw_fini(struct ras_core_context *ras_core) 666 { 667 struct ras_aca *ras_aca = &ras_core->ras_aca; 668 669 ras_aca->ue_updated_mark = 0; 670 671 return 0; 672 } 673