1 /* 2 * Copyright 2023 Advanced Micro Devices, Inc. 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice shall be included in 12 * all copies or substantial portions of the Software. 13 * 14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 17 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 18 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 19 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 20 * OTHER DEALINGS IN THE SOFTWARE. 21 * 22 */ 23 24 #include <linux/list.h> 25 #include "amdgpu.h" 26 #include "amdgpu_aca.h" 27 #include "amdgpu_ras.h" 28 29 #define ACA_BANK_HWID(type, hwid, mcatype) [ACA_HWIP_TYPE_##type] = {hwid, mcatype} 30 31 typedef int bank_handler_t(struct aca_handle *handle, struct aca_bank *bank, enum aca_smu_type type, void *data); 32 33 struct aca_banks { 34 int nr_banks; 35 struct list_head list; 36 }; 37 38 struct aca_hwip { 39 int hwid; 40 int mcatype; 41 }; 42 43 static struct aca_hwip aca_hwid_mcatypes[ACA_HWIP_TYPE_COUNT] = { 44 ACA_BANK_HWID(SMU, 0x01, 0x01), 45 ACA_BANK_HWID(PCS_XGMI, 0x50, 0x00), 46 ACA_BANK_HWID(UMC, 0x96, 0x00), 47 }; 48 49 static void aca_banks_init(struct aca_banks *banks) 50 { 51 if (!banks) 52 return; 53 54 memset(banks, 0, sizeof(*banks)); 55 INIT_LIST_HEAD(&banks->list); 56 } 57 58 static int aca_banks_add_bank(struct aca_banks *banks, struct aca_bank *bank) 59 { 60 struct aca_bank_node *node; 61 62 if (!bank) 63 return -EINVAL; 64 65 node = kvzalloc(sizeof(*node), GFP_KERNEL); 66 if (!node) 67 return -ENOMEM; 68 69 memcpy(&node->bank, bank, sizeof(*bank)); 70 71 INIT_LIST_HEAD(&node->node); 72 list_add_tail(&node->node, &banks->list); 73 74 banks->nr_banks++; 75 76 return 0; 77 } 78 79 static void aca_banks_release(struct aca_banks *banks) 80 { 81 struct aca_bank_node *node, *tmp; 82 83 list_for_each_entry_safe(node, tmp, &banks->list, node) { 84 list_del(&node->node); 85 kvfree(node); 86 } 87 } 88 89 static int aca_smu_get_valid_aca_count(struct amdgpu_device *adev, enum aca_smu_type type, u32 *count) 90 { 91 struct amdgpu_aca *aca = &adev->aca; 92 const struct aca_smu_funcs *smu_funcs = aca->smu_funcs; 93 94 if (!count) 95 return -EINVAL; 96 97 if (!smu_funcs || !smu_funcs->get_valid_aca_count) 98 return -EOPNOTSUPP; 99 100 return smu_funcs->get_valid_aca_count(adev, type, count); 101 } 102 103 static struct aca_regs_dump { 104 const char *name; 105 int reg_idx; 106 } aca_regs[] = { 107 {"CONTROL", ACA_REG_IDX_CTL}, 108 {"STATUS", ACA_REG_IDX_STATUS}, 109 {"ADDR", ACA_REG_IDX_ADDR}, 110 {"MISC", ACA_REG_IDX_MISC0}, 111 {"CONFIG", ACA_REG_IDX_CONFG}, 112 {"IPID", ACA_REG_IDX_IPID}, 113 {"SYND", ACA_REG_IDX_SYND}, 114 {"DESTAT", ACA_REG_IDX_DESTAT}, 115 {"DEADDR", ACA_REG_IDX_DEADDR}, 116 {"CONTROL_MASK", ACA_REG_IDX_CTL_MASK}, 117 }; 118 119 static void aca_smu_bank_dump(struct amdgpu_device *adev, int idx, int total, struct aca_bank *bank) 120 { 121 int i; 122 123 dev_info(adev->dev, HW_ERR "Accelerator Check Architecture events logged\n"); 124 /* plus 1 for output format, e.g: ACA[08/08]: xxxx */ 125 for (i = 0; i < ARRAY_SIZE(aca_regs); i++) 126 dev_info(adev->dev, HW_ERR "ACA[%02d/%02d].%s=0x%016llx\n", 127 idx + 1, total, aca_regs[i].name, bank->regs[aca_regs[i].reg_idx]); 128 } 129 130 static int aca_smu_get_valid_aca_banks(struct amdgpu_device *adev, enum aca_smu_type type, 131 int start, int count, 132 struct aca_banks *banks) 133 { 134 struct amdgpu_aca *aca = &adev->aca; 135 const struct aca_smu_funcs *smu_funcs = aca->smu_funcs; 136 struct aca_bank bank; 137 int i, max_count, ret; 138 139 if (!count) 140 return 0; 141 142 if (!smu_funcs || !smu_funcs->get_valid_aca_bank) 143 return -EOPNOTSUPP; 144 145 switch (type) { 146 case ACA_SMU_TYPE_UE: 147 max_count = smu_funcs->max_ue_bank_count; 148 break; 149 case ACA_SMU_TYPE_CE: 150 max_count = smu_funcs->max_ce_bank_count; 151 break; 152 default: 153 return -EINVAL; 154 } 155 156 if (start + count >= max_count) 157 return -EINVAL; 158 159 count = min_t(int, count, max_count); 160 for (i = 0; i < count; i++) { 161 memset(&bank, 0, sizeof(bank)); 162 ret = smu_funcs->get_valid_aca_bank(adev, type, start + i, &bank); 163 if (ret) 164 return ret; 165 166 bank.type = type; 167 168 aca_smu_bank_dump(adev, i, count, &bank); 169 170 ret = aca_banks_add_bank(banks, &bank); 171 if (ret) 172 return ret; 173 } 174 175 return 0; 176 } 177 178 static bool aca_bank_hwip_is_matched(struct aca_bank *bank, enum aca_hwip_type type) 179 { 180 181 struct aca_hwip *hwip; 182 int hwid, mcatype; 183 u64 ipid; 184 185 if (!bank || type == ACA_HWIP_TYPE_UNKNOW) 186 return false; 187 188 hwip = &aca_hwid_mcatypes[type]; 189 if (!hwip->hwid) 190 return false; 191 192 ipid = bank->regs[ACA_REG_IDX_IPID]; 193 hwid = ACA_REG__IPID__HARDWAREID(ipid); 194 mcatype = ACA_REG__IPID__MCATYPE(ipid); 195 196 return hwip->hwid == hwid && hwip->mcatype == mcatype; 197 } 198 199 static bool aca_bank_is_valid(struct aca_handle *handle, struct aca_bank *bank, enum aca_smu_type type) 200 { 201 const struct aca_bank_ops *bank_ops = handle->bank_ops; 202 203 if (!aca_bank_hwip_is_matched(bank, handle->hwip)) 204 return false; 205 206 if (!bank_ops->aca_bank_is_valid) 207 return true; 208 209 return bank_ops->aca_bank_is_valid(handle, bank, type, handle->data); 210 } 211 212 static struct aca_bank_error *new_bank_error(struct aca_error *aerr, struct aca_bank_info *info) 213 { 214 struct aca_bank_error *bank_error; 215 216 bank_error = kvzalloc(sizeof(*bank_error), GFP_KERNEL); 217 if (!bank_error) 218 return NULL; 219 220 INIT_LIST_HEAD(&bank_error->node); 221 memcpy(&bank_error->info, info, sizeof(*info)); 222 223 mutex_lock(&aerr->lock); 224 list_add_tail(&bank_error->node, &aerr->list); 225 mutex_unlock(&aerr->lock); 226 227 return bank_error; 228 } 229 230 static struct aca_bank_error *find_bank_error(struct aca_error *aerr, struct aca_bank_info *info) 231 { 232 struct aca_bank_error *bank_error = NULL; 233 struct aca_bank_info *tmp_info; 234 bool found = false; 235 236 mutex_lock(&aerr->lock); 237 list_for_each_entry(bank_error, &aerr->list, node) { 238 tmp_info = &bank_error->info; 239 if (tmp_info->socket_id == info->socket_id && 240 tmp_info->die_id == info->die_id) { 241 found = true; 242 goto out_unlock; 243 } 244 } 245 246 out_unlock: 247 mutex_unlock(&aerr->lock); 248 249 return found ? bank_error : NULL; 250 } 251 252 static void aca_bank_error_remove(struct aca_error *aerr, struct aca_bank_error *bank_error) 253 { 254 if (!aerr || !bank_error) 255 return; 256 257 list_del(&bank_error->node); 258 aerr->nr_errors--; 259 260 kvfree(bank_error); 261 } 262 263 static struct aca_bank_error *get_bank_error(struct aca_error *aerr, struct aca_bank_info *info) 264 { 265 struct aca_bank_error *bank_error; 266 267 if (!aerr || !info) 268 return NULL; 269 270 bank_error = find_bank_error(aerr, info); 271 if (bank_error) 272 return bank_error; 273 274 return new_bank_error(aerr, info); 275 } 276 277 int aca_error_cache_log_bank_error(struct aca_handle *handle, struct aca_bank_info *info, 278 enum aca_error_type type, u64 count) 279 { 280 struct aca_error_cache *error_cache = &handle->error_cache; 281 struct aca_bank_error *bank_error; 282 struct aca_error *aerr; 283 284 if (!handle || !info || type >= ACA_ERROR_TYPE_COUNT) 285 return -EINVAL; 286 287 if (!count) 288 return 0; 289 290 aerr = &error_cache->errors[type]; 291 bank_error = get_bank_error(aerr, info); 292 if (!bank_error) 293 return -ENOMEM; 294 295 bank_error->count += count; 296 297 return 0; 298 } 299 300 static int aca_bank_parser(struct aca_handle *handle, struct aca_bank *bank, enum aca_smu_type type) 301 { 302 const struct aca_bank_ops *bank_ops = handle->bank_ops; 303 304 if (!bank) 305 return -EINVAL; 306 307 if (!bank_ops->aca_bank_parser) 308 return -EOPNOTSUPP; 309 310 return bank_ops->aca_bank_parser(handle, bank, type, 311 handle->data); 312 } 313 314 static int handler_aca_log_bank_error(struct aca_handle *handle, struct aca_bank *bank, 315 enum aca_smu_type type, void *data) 316 { 317 int ret; 318 319 ret = aca_bank_parser(handle, bank, type); 320 if (ret) 321 return ret; 322 323 return 0; 324 } 325 326 static int aca_dispatch_bank(struct aca_handle_manager *mgr, struct aca_bank *bank, 327 enum aca_smu_type type, bank_handler_t handler, void *data) 328 { 329 struct aca_handle *handle; 330 int ret; 331 332 if (list_empty(&mgr->list)) 333 return 0; 334 335 list_for_each_entry(handle, &mgr->list, node) { 336 if (!aca_bank_is_valid(handle, bank, type)) 337 continue; 338 339 ret = handler(handle, bank, type, data); 340 if (ret) 341 return ret; 342 } 343 344 return 0; 345 } 346 347 static int aca_dispatch_banks(struct aca_handle_manager *mgr, struct aca_banks *banks, 348 enum aca_smu_type type, bank_handler_t handler, void *data) 349 { 350 struct aca_bank_node *node; 351 struct aca_bank *bank; 352 int ret; 353 354 if (!mgr || !banks) 355 return -EINVAL; 356 357 /* pre check to avoid unnecessary operations */ 358 if (list_empty(&mgr->list) || list_empty(&banks->list)) 359 return 0; 360 361 list_for_each_entry(node, &banks->list, node) { 362 bank = &node->bank; 363 364 ret = aca_dispatch_bank(mgr, bank, type, handler, data); 365 if (ret) 366 return ret; 367 } 368 369 return 0; 370 } 371 372 static int aca_banks_update(struct amdgpu_device *adev, enum aca_smu_type type, 373 bank_handler_t handler, void *data) 374 { 375 struct amdgpu_aca *aca = &adev->aca; 376 struct aca_banks banks; 377 u32 count = 0; 378 int ret; 379 380 if (list_empty(&aca->mgr.list)) 381 return 0; 382 383 ret = aca_smu_get_valid_aca_count(adev, type, &count); 384 if (ret) 385 return ret; 386 387 if (!count) 388 return 0; 389 390 aca_banks_init(&banks); 391 392 ret = aca_smu_get_valid_aca_banks(adev, type, 0, count, &banks); 393 if (ret) 394 goto err_release_banks; 395 396 if (list_empty(&banks.list)) { 397 ret = 0; 398 goto err_release_banks; 399 } 400 401 ret = aca_dispatch_banks(&aca->mgr, &banks, type, 402 handler, data); 403 if (ret) 404 goto err_release_banks; 405 406 err_release_banks: 407 aca_banks_release(&banks); 408 409 return ret; 410 } 411 412 static int aca_log_aca_error_data(struct aca_bank_error *bank_error, enum aca_error_type type, struct ras_err_data *err_data) 413 { 414 struct aca_bank_info *info; 415 struct amdgpu_smuio_mcm_config_info mcm_info; 416 u64 count; 417 418 if (type >= ACA_ERROR_TYPE_COUNT) 419 return -EINVAL; 420 421 count = bank_error->count; 422 if (!count) 423 return 0; 424 425 info = &bank_error->info; 426 mcm_info.die_id = info->die_id; 427 mcm_info.socket_id = info->socket_id; 428 429 switch (type) { 430 case ACA_ERROR_TYPE_UE: 431 amdgpu_ras_error_statistic_ue_count(err_data, &mcm_info, NULL, count); 432 break; 433 case ACA_ERROR_TYPE_CE: 434 amdgpu_ras_error_statistic_ce_count(err_data, &mcm_info, NULL, count); 435 break; 436 case ACA_ERROR_TYPE_DEFERRED: 437 amdgpu_ras_error_statistic_de_count(err_data, &mcm_info, NULL, count); 438 break; 439 default: 440 break; 441 } 442 443 return 0; 444 } 445 446 static int aca_log_aca_error(struct aca_handle *handle, enum aca_error_type type, struct ras_err_data *err_data) 447 { 448 struct aca_error_cache *error_cache = &handle->error_cache; 449 struct aca_error *aerr = &error_cache->errors[type]; 450 struct aca_bank_error *bank_error, *tmp; 451 452 mutex_lock(&aerr->lock); 453 454 if (list_empty(&aerr->list)) 455 goto out_unlock; 456 457 list_for_each_entry_safe(bank_error, tmp, &aerr->list, node) { 458 aca_log_aca_error_data(bank_error, type, err_data); 459 aca_bank_error_remove(aerr, bank_error); 460 } 461 462 out_unlock: 463 mutex_unlock(&aerr->lock); 464 465 return 0; 466 } 467 468 static int __aca_get_error_data(struct amdgpu_device *adev, struct aca_handle *handle, enum aca_error_type type, 469 struct ras_err_data *err_data) 470 { 471 enum aca_smu_type smu_type; 472 int ret; 473 474 switch (type) { 475 case ACA_ERROR_TYPE_UE: 476 smu_type = ACA_SMU_TYPE_UE; 477 break; 478 case ACA_ERROR_TYPE_CE: 479 case ACA_ERROR_TYPE_DEFERRED: 480 smu_type = ACA_SMU_TYPE_CE; 481 break; 482 default: 483 return -EINVAL; 484 } 485 486 /* udpate aca bank to aca source error_cache first */ 487 ret = aca_banks_update(adev, smu_type, handler_aca_log_bank_error, NULL); 488 if (ret) 489 return ret; 490 491 return aca_log_aca_error(handle, type, err_data); 492 } 493 494 static bool aca_handle_is_valid(struct aca_handle *handle) 495 { 496 if (!handle->mask || !list_empty(&handle->node)) 497 return false; 498 499 return true; 500 } 501 502 int amdgpu_aca_get_error_data(struct amdgpu_device *adev, struct aca_handle *handle, 503 enum aca_error_type type, void *data) 504 { 505 struct ras_err_data *err_data = (struct ras_err_data *)data; 506 507 if (!handle || !err_data) 508 return -EINVAL; 509 510 if (aca_handle_is_valid(handle)) 511 return -EOPNOTSUPP; 512 513 if (!(BIT(type) & handle->mask)) 514 return 0; 515 516 return __aca_get_error_data(adev, handle, type, err_data); 517 } 518 519 static void aca_error_init(struct aca_error *aerr, enum aca_error_type type) 520 { 521 mutex_init(&aerr->lock); 522 INIT_LIST_HEAD(&aerr->list); 523 aerr->type = type; 524 aerr->nr_errors = 0; 525 } 526 527 static void aca_init_error_cache(struct aca_handle *handle) 528 { 529 struct aca_error_cache *error_cache = &handle->error_cache; 530 int type; 531 532 for (type = ACA_ERROR_TYPE_UE; type < ACA_ERROR_TYPE_COUNT; type++) 533 aca_error_init(&error_cache->errors[type], type); 534 } 535 536 static void aca_error_fini(struct aca_error *aerr) 537 { 538 struct aca_bank_error *bank_error, *tmp; 539 540 mutex_lock(&aerr->lock); 541 list_for_each_entry_safe(bank_error, tmp, &aerr->list, node) 542 aca_bank_error_remove(aerr, bank_error); 543 544 mutex_destroy(&aerr->lock); 545 } 546 547 static void aca_fini_error_cache(struct aca_handle *handle) 548 { 549 struct aca_error_cache *error_cache = &handle->error_cache; 550 int type; 551 552 for (type = ACA_ERROR_TYPE_UE; type < ACA_ERROR_TYPE_COUNT; type++) 553 aca_error_fini(&error_cache->errors[type]); 554 } 555 556 static int add_aca_handle(struct amdgpu_device *adev, struct aca_handle_manager *mgr, struct aca_handle *handle, 557 const char *name, const struct aca_info *ras_info, void *data) 558 { 559 memset(handle, 0, sizeof(*handle)); 560 561 handle->adev = adev; 562 handle->mgr = mgr; 563 handle->name = name; 564 handle->hwip = ras_info->hwip; 565 handle->mask = ras_info->mask; 566 handle->bank_ops = ras_info->bank_ops; 567 handle->data = data; 568 aca_init_error_cache(handle); 569 570 INIT_LIST_HEAD(&handle->node); 571 list_add_tail(&handle->node, &mgr->list); 572 mgr->nr_handles++; 573 574 return 0; 575 } 576 577 static ssize_t aca_sysfs_read(struct device *dev, 578 struct device_attribute *attr, char *buf) 579 { 580 struct aca_handle *handle = container_of(attr, struct aca_handle, aca_attr); 581 582 /* NOTE: the aca cache will be auto cleared once read, 583 * So the driver should unify the query entry point, forward request to ras query interface directly */ 584 return amdgpu_ras_aca_sysfs_read(dev, attr, handle, buf, handle->data); 585 } 586 587 static int add_aca_sysfs(struct amdgpu_device *adev, struct aca_handle *handle) 588 { 589 struct device_attribute *aca_attr = &handle->aca_attr; 590 591 snprintf(handle->attr_name, sizeof(handle->attr_name) - 1, "aca_%s", handle->name); 592 aca_attr->show = aca_sysfs_read; 593 aca_attr->attr.name = handle->attr_name; 594 aca_attr->attr.mode = S_IRUGO; 595 sysfs_attr_init(&aca_attr->attr); 596 597 return sysfs_add_file_to_group(&adev->dev->kobj, 598 &aca_attr->attr, 599 "ras"); 600 } 601 602 int amdgpu_aca_add_handle(struct amdgpu_device *adev, struct aca_handle *handle, 603 const char *name, const struct aca_info *ras_info, void *data) 604 { 605 struct amdgpu_aca *aca = &adev->aca; 606 int ret; 607 608 if (!amdgpu_aca_is_enabled(adev)) 609 return 0; 610 611 ret = add_aca_handle(adev, &aca->mgr, handle, name, ras_info, data); 612 if (ret) 613 return ret; 614 615 return add_aca_sysfs(adev, handle); 616 } 617 618 static void remove_aca_handle(struct aca_handle *handle) 619 { 620 struct aca_handle_manager *mgr = handle->mgr; 621 622 aca_fini_error_cache(handle); 623 list_del(&handle->node); 624 mgr->nr_handles--; 625 } 626 627 static void remove_aca_sysfs(struct aca_handle *handle) 628 { 629 struct amdgpu_device *adev = handle->adev; 630 struct device_attribute *aca_attr = &handle->aca_attr; 631 632 if (adev->dev->kobj.sd) 633 sysfs_remove_file_from_group(&adev->dev->kobj, 634 &aca_attr->attr, 635 "ras"); 636 } 637 638 void amdgpu_aca_remove_handle(struct aca_handle *handle) 639 { 640 if (!handle || list_empty(&handle->node)) 641 return; 642 643 remove_aca_sysfs(handle); 644 remove_aca_handle(handle); 645 } 646 647 static int aca_manager_init(struct aca_handle_manager *mgr) 648 { 649 INIT_LIST_HEAD(&mgr->list); 650 mgr->nr_handles = 0; 651 652 return 0; 653 } 654 655 static void aca_manager_fini(struct aca_handle_manager *mgr) 656 { 657 struct aca_handle *handle, *tmp; 658 659 list_for_each_entry_safe(handle, tmp, &mgr->list, node) 660 amdgpu_aca_remove_handle(handle); 661 } 662 663 bool amdgpu_aca_is_enabled(struct amdgpu_device *adev) 664 { 665 return adev->aca.is_enabled; 666 } 667 668 int amdgpu_aca_init(struct amdgpu_device *adev) 669 { 670 struct amdgpu_aca *aca = &adev->aca; 671 int ret; 672 673 ret = aca_manager_init(&aca->mgr); 674 if (ret) 675 return ret; 676 677 return 0; 678 } 679 680 void amdgpu_aca_fini(struct amdgpu_device *adev) 681 { 682 struct amdgpu_aca *aca = &adev->aca; 683 684 aca_manager_fini(&aca->mgr); 685 } 686 687 int amdgpu_aca_reset(struct amdgpu_device *adev) 688 { 689 amdgpu_aca_fini(adev); 690 691 return amdgpu_aca_init(adev); 692 } 693 694 void amdgpu_aca_set_smu_funcs(struct amdgpu_device *adev, const struct aca_smu_funcs *smu_funcs) 695 { 696 struct amdgpu_aca *aca = &adev->aca; 697 698 WARN_ON(aca->smu_funcs); 699 aca->smu_funcs = smu_funcs; 700 } 701 702 int aca_bank_info_decode(struct aca_bank *bank, struct aca_bank_info *info) 703 { 704 u64 ipid; 705 u32 instidhi, instidlo; 706 707 if (!bank || !info) 708 return -EINVAL; 709 710 ipid = bank->regs[ACA_REG_IDX_IPID]; 711 info->hwid = ACA_REG__IPID__HARDWAREID(ipid); 712 info->mcatype = ACA_REG__IPID__MCATYPE(ipid); 713 /* 714 * Unfied DieID Format: SAASS. A:AID, S:Socket. 715 * Unfied DieID[4:4] = InstanceId[0:0] 716 * Unfied DieID[0:3] = InstanceIdHi[0:3] 717 */ 718 instidhi = ACA_REG__IPID__INSTANCEIDHI(ipid); 719 instidlo = ACA_REG__IPID__INSTANCEIDLO(ipid); 720 info->die_id = ((instidhi >> 2) & 0x03); 721 info->socket_id = ((instidlo & 0x1) << 2) | (instidhi & 0x03); 722 723 return 0; 724 } 725 726 static int aca_bank_get_error_code(struct amdgpu_device *adev, struct aca_bank *bank) 727 { 728 int error_code; 729 730 switch (amdgpu_ip_version(adev, MP1_HWIP, 0)) { 731 case IP_VERSION(13, 0, 6): 732 if (!(adev->flags & AMD_IS_APU) && adev->pm.fw_version >= 0x00555600) { 733 error_code = ACA_REG__SYND__ERRORINFORMATION(bank->regs[ACA_REG_IDX_SYND]); 734 return error_code & 0xff; 735 } 736 break; 737 default: 738 break; 739 } 740 741 /* NOTE: the true error code is encoded in status.errorcode[0:7] */ 742 error_code = ACA_REG__STATUS__ERRORCODE(bank->regs[ACA_REG_IDX_STATUS]); 743 744 return error_code & 0xff; 745 } 746 747 int aca_bank_check_error_codes(struct amdgpu_device *adev, struct aca_bank *bank, int *err_codes, int size) 748 { 749 int i, error_code; 750 751 if (!bank || !err_codes) 752 return -EINVAL; 753 754 error_code = aca_bank_get_error_code(adev, bank); 755 for (i = 0; i < size; i++) { 756 if (err_codes[i] == error_code) 757 return 0; 758 } 759 760 return -EINVAL; 761 } 762 763 int amdgpu_aca_smu_set_debug_mode(struct amdgpu_device *adev, bool en) 764 { 765 struct amdgpu_aca *aca = &adev->aca; 766 const struct aca_smu_funcs *smu_funcs = aca->smu_funcs; 767 768 if (!smu_funcs || !smu_funcs->set_debug_mode) 769 return -EOPNOTSUPP; 770 771 return smu_funcs->set_debug_mode(adev, en); 772 } 773 774 #if defined(CONFIG_DEBUG_FS) 775 static int amdgpu_aca_smu_debug_mode_set(void *data, u64 val) 776 { 777 struct amdgpu_device *adev = (struct amdgpu_device *)data; 778 int ret; 779 780 ret = amdgpu_ras_set_aca_debug_mode(adev, val ? true : false); 781 if (ret) 782 return ret; 783 784 dev_info(adev->dev, "amdgpu set smu aca debug mode %s success\n", val ? "on" : "off"); 785 786 return 0; 787 } 788 789 static void aca_dump_entry(struct seq_file *m, struct aca_bank *bank, enum aca_smu_type type, int idx) 790 { 791 struct aca_bank_info info; 792 int i, ret; 793 794 ret = aca_bank_info_decode(bank, &info); 795 if (ret) 796 return; 797 798 seq_printf(m, "aca entry[%d].type: %s\n", idx, type == ACA_SMU_TYPE_UE ? "UE" : "CE"); 799 seq_printf(m, "aca entry[%d].info: socketid:%d aid:%d hwid:0x%03x mcatype:0x%04x\n", 800 idx, info.socket_id, info.die_id, info.hwid, info.mcatype); 801 802 for (i = 0; i < ARRAY_SIZE(aca_regs); i++) 803 seq_printf(m, "aca entry[%d].regs[%d]: 0x%016llx\n", idx, aca_regs[i].reg_idx, bank->regs[aca_regs[i].reg_idx]); 804 } 805 806 struct aca_dump_context { 807 struct seq_file *m; 808 int idx; 809 }; 810 811 static int handler_aca_bank_dump(struct aca_handle *handle, struct aca_bank *bank, 812 enum aca_smu_type type, void *data) 813 { 814 struct aca_dump_context *ctx = (struct aca_dump_context *)data; 815 816 aca_dump_entry(ctx->m, bank, type, ctx->idx++); 817 818 return handler_aca_log_bank_error(handle, bank, type, NULL); 819 } 820 821 static int aca_dump_show(struct seq_file *m, enum aca_smu_type type) 822 { 823 struct amdgpu_device *adev = (struct amdgpu_device *)m->private; 824 struct aca_dump_context context = { 825 .m = m, 826 .idx = 0, 827 }; 828 829 return aca_banks_update(adev, type, handler_aca_bank_dump, (void *)&context); 830 } 831 832 static int aca_dump_ce_show(struct seq_file *m, void *unused) 833 { 834 return aca_dump_show(m, ACA_SMU_TYPE_CE); 835 } 836 837 static int aca_dump_ce_open(struct inode *inode, struct file *file) 838 { 839 return single_open(file, aca_dump_ce_show, inode->i_private); 840 } 841 842 static const struct file_operations aca_ce_dump_debug_fops = { 843 .owner = THIS_MODULE, 844 .open = aca_dump_ce_open, 845 .read = seq_read, 846 .llseek = seq_lseek, 847 .release = single_release, 848 }; 849 850 static int aca_dump_ue_show(struct seq_file *m, void *unused) 851 { 852 return aca_dump_show(m, ACA_SMU_TYPE_UE); 853 } 854 855 static int aca_dump_ue_open(struct inode *inode, struct file *file) 856 { 857 return single_open(file, aca_dump_ue_show, inode->i_private); 858 } 859 860 static const struct file_operations aca_ue_dump_debug_fops = { 861 .owner = THIS_MODULE, 862 .open = aca_dump_ue_open, 863 .read = seq_read, 864 .llseek = seq_lseek, 865 .release = single_release, 866 }; 867 868 DEFINE_DEBUGFS_ATTRIBUTE(aca_debug_mode_fops, NULL, amdgpu_aca_smu_debug_mode_set, "%llu\n"); 869 #endif 870 871 void amdgpu_aca_smu_debugfs_init(struct amdgpu_device *adev, struct dentry *root) 872 { 873 #if defined(CONFIG_DEBUG_FS) 874 if (!root || adev->ip_versions[MP1_HWIP][0] != IP_VERSION(13, 0, 6)) 875 return; 876 877 debugfs_create_file("aca_debug_mode", 0200, root, adev, &aca_debug_mode_fops); 878 debugfs_create_file("aca_ue_dump", 0400, root, adev, &aca_ue_dump_debug_fops); 879 debugfs_create_file("aca_ce_dump", 0400, root, adev, &aca_ce_dump_debug_fops); 880 #endif 881 } 882