1 /* 2 * Copyright 2018 Advanced Micro Devices, Inc. 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice shall be included in 12 * all copies or substantial portions of the Software. 13 * 14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 17 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 18 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 19 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 20 * OTHER DEALINGS IN THE SOFTWARE. 21 * 22 * 23 */ 24 #include "amdgpu_reg_access.h" 25 #include <linux/debugfs.h> 26 #include <linux/list.h> 27 #include <linux/module.h> 28 #include <linux/uaccess.h> 29 #include <linux/reboot.h> 30 #include <linux/syscalls.h> 31 #include <linux/pm_runtime.h> 32 #include <linux/list_sort.h> 33 34 #include "amdgpu.h" 35 #include "amdgpu_ras.h" 36 #include "amdgpu_atomfirmware.h" 37 #include "amdgpu_xgmi.h" 38 #include "ivsrcid/nbio/irqsrcs_nbif_7_4.h" 39 #include "nbio_v4_3.h" 40 #include "nbif_v6_3_1.h" 41 #include "nbio_v7_9.h" 42 #include "atom.h" 43 #include "amdgpu_reset.h" 44 #include "amdgpu_psp.h" 45 #include "amdgpu_ras_mgr.h" 46 #include "amdgpu_virt_ras_cmd.h" 47 48 #ifdef CONFIG_X86_MCE_AMD 49 #include <asm/mce.h> 50 51 static bool notifier_registered; 52 #endif 53 static const char *RAS_FS_NAME = "ras"; 54 55 const char *ras_error_string[] = { 56 "none", 57 "parity", 58 "single_correctable", 59 "multi_uncorrectable", 60 "poison", 61 }; 62 63 const char *ras_block_string[] = { 64 "umc", 65 "sdma", 66 "gfx", 67 "mmhub", 68 "athub", 69 "pcie_bif", 70 "hdp", 71 "xgmi_wafl", 72 "df", 73 "smn", 74 "sem", 75 "mp0", 76 "mp1", 77 "fuse", 78 "mca", 79 "vcn", 80 "jpeg", 81 "ih", 82 "mpio", 83 "mmsch", 84 }; 85 86 const char *ras_mca_block_string[] = { 87 "mca_mp0", 88 "mca_mp1", 89 "mca_mpio", 90 "mca_iohc", 91 }; 92 93 struct amdgpu_ras_block_list { 94 /* ras block link */ 95 struct list_head node; 96 97 struct amdgpu_ras_block_object *ras_obj; 98 }; 99 100 const char *get_ras_block_str(struct ras_common_if *ras_block) 101 { 102 if (!ras_block) 103 return "NULL"; 104 105 if (ras_block->block >= AMDGPU_RAS_BLOCK_COUNT || 106 ras_block->block >= ARRAY_SIZE(ras_block_string)) 107 return "OUT OF RANGE"; 108 109 if (ras_block->block == AMDGPU_RAS_BLOCK__MCA) 110 return ras_mca_block_string[ras_block->sub_block_index]; 111 112 return ras_block_string[ras_block->block]; 113 } 114 115 #define ras_block_str(_BLOCK_) \ 116 (((_BLOCK_) < ARRAY_SIZE(ras_block_string)) ? 
ras_block_string[_BLOCK_] : "Out Of Range") 117 118 #define ras_err_str(i) (ras_error_string[ffs(i)]) 119 120 #define RAS_DEFAULT_FLAGS (AMDGPU_RAS_FLAG_INIT_BY_VBIOS) 121 122 /* inject address is 52 bits */ 123 #define RAS_UMC_INJECT_ADDR_LIMIT (0x1ULL << 52) 124 125 /* typical ECC bad page rate is 1 bad page per 100MB VRAM */ 126 #define RAS_BAD_PAGE_COVER (100 * 1024 * 1024ULL) 127 128 #define MAX_UMC_POISON_POLLING_TIME_ASYNC 10 129 130 #define AMDGPU_RAS_RETIRE_PAGE_INTERVAL 100 //ms 131 132 #define MAX_FLUSH_RETIRE_DWORK_TIMES 100 133 134 #define BYPASS_ALLOCATED_ADDRESS 0x0 135 #define BYPASS_INITIALIZATION_ADDRESS 0x1 136 137 enum amdgpu_ras_retire_page_reservation { 138 AMDGPU_RAS_RETIRE_PAGE_RESERVED, 139 AMDGPU_RAS_RETIRE_PAGE_PENDING, 140 AMDGPU_RAS_RETIRE_PAGE_FAULT, 141 }; 142 143 atomic_t amdgpu_ras_in_intr = ATOMIC_INIT(0); 144 145 static int amdgpu_ras_check_bad_page_unlock(struct amdgpu_ras *con, 146 uint64_t addr); 147 static int amdgpu_ras_check_bad_page(struct amdgpu_device *adev, 148 uint64_t addr); 149 150 static void amdgpu_ras_critical_region_init(struct amdgpu_device *adev); 151 static void amdgpu_ras_critical_region_fini(struct amdgpu_device *adev); 152 153 #ifdef CONFIG_X86_MCE_AMD 154 static void amdgpu_register_bad_pages_mca_notifier(struct amdgpu_device *adev); 155 static void 156 amdgpu_unregister_bad_pages_mca_notifier(struct amdgpu_device *adev); 157 struct mce_notifier_adev_list { 158 struct amdgpu_device *devs[MAX_GPU_INSTANCE]; 159 int num_gpu; 160 }; 161 static struct mce_notifier_adev_list mce_adev_list; 162 #endif 163 164 void amdgpu_ras_set_error_query_ready(struct amdgpu_device *adev, bool ready) 165 { 166 if (adev && amdgpu_ras_get_context(adev)) 167 amdgpu_ras_get_context(adev)->error_query_ready = ready; 168 } 169 170 static bool amdgpu_ras_get_error_query_ready(struct amdgpu_device *adev) 171 { 172 if (adev && amdgpu_ras_get_context(adev)) 173 return amdgpu_ras_get_context(adev)->error_query_ready; 174 175 return false; 176 } 177 178 static int amdgpu_reserve_page_direct(struct amdgpu_device *adev, uint64_t address) 179 { 180 struct ras_err_data err_data; 181 struct eeprom_table_record err_rec; 182 int ret; 183 184 ret = amdgpu_ras_check_bad_page(adev, address); 185 if (ret == -EINVAL) { 186 dev_warn(adev->dev, 187 "RAS WARN: input address 0x%llx is invalid.\n", 188 address); 189 return -EINVAL; 190 } else if (ret == 1) { 191 dev_warn(adev->dev, 192 "RAS WARN: 0x%llx has already been marked as bad page!\n", 193 address); 194 return 0; 195 } 196 197 ret = amdgpu_ras_error_data_init(&err_data); 198 if (ret) 199 return ret; 200 201 memset(&err_rec, 0x0, sizeof(struct eeprom_table_record)); 202 err_data.err_addr = &err_rec; 203 amdgpu_umc_fill_error_record(&err_data, address, address, 0, 0); 204 205 if (amdgpu_bad_page_threshold != 0) { 206 amdgpu_ras_add_bad_pages(adev, err_data.err_addr, 207 err_data.err_addr_cnt, false); 208 amdgpu_ras_save_bad_pages(adev, NULL); 209 } 210 211 amdgpu_ras_error_data_fini(&err_data); 212 213 dev_warn(adev->dev, "WARNING: THIS IS ONLY FOR TEST PURPOSES AND WILL CORRUPT RAS EEPROM\n"); 214 dev_warn(adev->dev, "Clear EEPROM:\n"); 215 dev_warn(adev->dev, " echo 1 > /sys/kernel/debug/dri/0/ras/ras_eeprom_reset\n"); 216 217 return 0; 218 } 219 220 static int amdgpu_check_address_validity(struct amdgpu_device *adev, 221 uint64_t address, uint64_t flags) 222 { 223 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 224 struct amdgpu_vram_block_info blk_info; 225 uint64_t page_pfns[32] = {0}; 226 int i, ret, count; 227 
bool hit = false; 228 229 if (amdgpu_ip_version(adev, UMC_HWIP, 0) < IP_VERSION(12, 0, 0)) 230 return 0; 231 232 if (amdgpu_sriov_vf(adev)) { 233 if (amdgpu_uniras_enabled(adev)) { 234 if (amdgpu_virt_ras_check_address_validity(adev, address, &hit)) 235 return -EPERM; 236 if (hit) 237 return -EACCES; 238 } else { 239 if (amdgpu_virt_check_vf_critical_region(adev, address, &hit)) 240 return -EPERM; 241 return hit ? -EACCES : 0; 242 } 243 } 244 245 if ((address >= adev->gmc.mc_vram_size) || 246 (address >= RAS_UMC_INJECT_ADDR_LIMIT)) 247 return -EFAULT; 248 249 if (amdgpu_uniras_enabled(adev)) { 250 if (amdgpu_sriov_vf(adev)) 251 count = amdgpu_virt_ras_convert_retired_address(adev, address, 252 page_pfns, ARRAY_SIZE(page_pfns)); 253 else 254 count = amdgpu_ras_mgr_lookup_bad_pages_in_a_row(adev, address, 255 page_pfns, ARRAY_SIZE(page_pfns)); 256 } else 257 count = amdgpu_umc_lookup_bad_pages_in_a_row(adev, 258 address, page_pfns, ARRAY_SIZE(page_pfns)); 259 260 if (count <= 0) 261 return -EPERM; 262 263 for (i = 0; i < count; i++) { 264 memset(&blk_info, 0, sizeof(blk_info)); 265 ret = amdgpu_vram_mgr_query_address_block_info(&adev->mman.vram_mgr, 266 page_pfns[i] << AMDGPU_GPU_PAGE_SHIFT, &blk_info); 267 if (!ret) { 268 /* The input address that needs to be checked is allocated by 269 * current calling process, so it is necessary to exclude 270 * the calling process. 271 */ 272 if ((flags == BYPASS_ALLOCATED_ADDRESS) && 273 ((blk_info.task.pid != task_pid_nr(current)) || 274 strncmp(blk_info.task.comm, current->comm, TASK_COMM_LEN))) 275 return -EACCES; 276 else if ((flags == BYPASS_INITIALIZATION_ADDRESS) && 277 (blk_info.task.pid == con->init_task_pid) && 278 !strncmp(blk_info.task.comm, con->init_task_comm, TASK_COMM_LEN)) 279 return -EACCES; 280 } 281 } 282 283 return 0; 284 } 285 286 static ssize_t amdgpu_ras_debugfs_read(struct file *f, char __user *buf, 287 size_t size, loff_t *pos) 288 { 289 struct ras_manager *obj = (struct ras_manager *)file_inode(f)->i_private; 290 struct ras_query_if info = { 291 .head = obj->head, 292 }; 293 ssize_t s; 294 char val[128]; 295 296 if (amdgpu_ras_query_error_status(obj->adev, &info)) 297 return -EINVAL; 298 299 /* Hardware counter will be reset automatically after the query on Vega20 and Arcturus */ 300 if (amdgpu_ip_version(obj->adev, MP0_HWIP, 0) != IP_VERSION(11, 0, 2) && 301 amdgpu_ip_version(obj->adev, MP0_HWIP, 0) != IP_VERSION(11, 0, 4)) { 302 if (amdgpu_ras_reset_error_status(obj->adev, info.head.block)) 303 dev_warn(obj->adev->dev, "Failed to reset error counter and error status"); 304 } 305 306 s = snprintf(val, sizeof(val), "%s: %lu\n%s: %lu\n", 307 "ue", info.ue_count, 308 "ce", info.ce_count); 309 if (*pos >= s) 310 return 0; 311 312 s -= *pos; 313 s = min_t(u64, s, size); 314 315 316 if (copy_to_user(buf, &val[*pos], s)) 317 return -EINVAL; 318 319 *pos += s; 320 321 return s; 322 } 323 324 static const struct file_operations amdgpu_ras_debugfs_ops = { 325 .owner = THIS_MODULE, 326 .read = amdgpu_ras_debugfs_read, 327 .write = NULL, 328 .llseek = default_llseek 329 }; 330 331 static int amdgpu_ras_find_block_id_by_name(const char *name, int *block_id) 332 { 333 int i; 334 335 for (i = 0; i < ARRAY_SIZE(ras_block_string); i++) { 336 *block_id = i; 337 if (strcmp(name, ras_block_string[i]) == 0) 338 return 0; 339 } 340 return -EINVAL; 341 } 342 343 static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f, 344 const char __user *buf, size_t size, 345 loff_t *pos, struct ras_debug_if *data) 346 { 347 ssize_t s = min_t(u64, 64, 
size); 348 char str[65]; 349 char block_name[33]; 350 char err[9] = "ue"; 351 int op = -1; 352 int block_id; 353 uint32_t sub_block; 354 u64 address, value; 355 /* default value is 0 if the mask is not set by user */ 356 u32 instance_mask = 0; 357 358 if (*pos) 359 return -EINVAL; 360 *pos = size; 361 362 memset(str, 0, sizeof(str)); 363 memset(data, 0, sizeof(*data)); 364 365 if (copy_from_user(str, buf, s)) 366 return -EINVAL; 367 368 if (sscanf(str, "disable %32s", block_name) == 1) 369 op = 0; 370 else if (sscanf(str, "enable %32s %8s", block_name, err) == 2) 371 op = 1; 372 else if (sscanf(str, "inject %32s %8s", block_name, err) == 2) 373 op = 2; 374 else if (strstr(str, "retire_page") != NULL) 375 op = 3; 376 else if (strstr(str, "check_address") != NULL) 377 op = 4; 378 else if (str[0] && str[1] && str[2] && str[3]) 379 /* ascii string, but commands are not matched. */ 380 return -EINVAL; 381 382 if (op != -1) { 383 if (op == 3) { 384 if (sscanf(str, "%*s 0x%llx", &address) != 1 && 385 sscanf(str, "%*s %llu", &address) != 1) 386 return -EINVAL; 387 388 data->op = op; 389 data->inject.address = address; 390 391 return 0; 392 } else if (op == 4) { 393 if (sscanf(str, "%*s 0x%llx 0x%llx", &address, &value) != 2 && 394 sscanf(str, "%*s %llu %llu", &address, &value) != 2) 395 return -EINVAL; 396 397 data->op = op; 398 data->inject.address = address; 399 data->inject.value = value; 400 return 0; 401 } 402 403 if (amdgpu_ras_find_block_id_by_name(block_name, &block_id)) 404 return -EINVAL; 405 406 data->head.block = block_id; 407 /* only ue, ce and poison errors are supported */ 408 if (!memcmp("ue", err, 2)) 409 data->head.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE; 410 else if (!memcmp("ce", err, 2)) 411 data->head.type = AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE; 412 else if (!memcmp("poison", err, 6)) 413 data->head.type = AMDGPU_RAS_ERROR__POISON; 414 else 415 return -EINVAL; 416 417 data->op = op; 418 419 if (op == 2) { 420 if (sscanf(str, "%*s %*s %*s 0x%x 0x%llx 0x%llx 0x%x", 421 &sub_block, &address, &value, &instance_mask) != 4 && 422 sscanf(str, "%*s %*s %*s %u %llu %llu %u", 423 &sub_block, &address, &value, &instance_mask) != 4 && 424 sscanf(str, "%*s %*s %*s 0x%x 0x%llx 0x%llx", 425 &sub_block, &address, &value) != 3 && 426 sscanf(str, "%*s %*s %*s %u %llu %llu", 427 &sub_block, &address, &value) != 3) 428 return -EINVAL; 429 data->head.sub_block_index = sub_block; 430 data->inject.address = address; 431 data->inject.value = value; 432 data->inject.instance_mask = instance_mask; 433 } 434 } else { 435 if (size < sizeof(*data)) 436 return -EINVAL; 437 438 if (copy_from_user(data, buf, sizeof(*data))) 439 return -EINVAL; 440 } 441 442 return 0; 443 } 444 445 static void amdgpu_ras_instance_mask_check(struct amdgpu_device *adev, 446 struct ras_debug_if *data) 447 { 448 int num_xcc = adev->gfx.xcc_mask ? 
			NUM_XCC(adev->gfx.xcc_mask) : 1;
	uint32_t mask, inst_mask = data->inject.instance_mask;

	/* no need to set instance mask if there is only one instance */
	if (num_xcc <= 1 && inst_mask) {
		data->inject.instance_mask = 0;
		dev_dbg(adev->dev,
			"RAS inject mask(0x%x) isn't supported and force it to 0.\n",
			inst_mask);

		return;
	}

	switch (data->head.block) {
	case AMDGPU_RAS_BLOCK__GFX:
		mask = GENMASK(num_xcc - 1, 0);
		break;
	case AMDGPU_RAS_BLOCK__SDMA:
		mask = GENMASK(adev->sdma.num_instances - 1, 0);
		break;
	case AMDGPU_RAS_BLOCK__VCN:
	case AMDGPU_RAS_BLOCK__JPEG:
		mask = GENMASK(adev->vcn.num_vcn_inst - 1, 0);
		break;
	default:
		mask = inst_mask;
		break;
	}

	/* remove invalid bits in instance mask */
	data->inject.instance_mask &= mask;
	if (inst_mask != data->inject.instance_mask)
		dev_dbg(adev->dev,
			"Adjust RAS inject mask 0x%x to 0x%x\n",
			inst_mask, data->inject.instance_mask);
}

/**
 * DOC: AMDGPU RAS debugfs control interface
 *
 * The control interface accepts struct ras_debug_if which has two members.
 *
 * First member: ras_debug_if::head or ras_debug_if::inject.
 *
 * head is used to indicate which IP block will be under control.
 *
 * head has four members: block, type, sub_block_index and name.
 * block: which IP will be under control.
 * type: what kind of error will be enabled/disabled/injected.
 * sub_block_index: some IPs have subcomponents, e.g. GFX, sDMA.
 * name: the name of the IP.
 *
 * inject has three more members than head: address, value and mask.
 * As their names indicate, the inject operation will write the
 * value to the address.
 *
 * The second member: struct ras_debug_if::op.
 * It has three kinds of operations.
 *
 * - 0: disable RAS on the block. Take ::head as its data.
 * - 1: enable RAS on the block. Take ::head as its data.
 * - 2: inject errors on the block. Take ::inject as its data.
 *
 * How to use the interface?
 *
 * In a program
 *
 * Copy the struct ras_debug_if in your code and initialize it.
 * Write the struct to the control interface.
 *
 * From shell
 *
 * .. code-block:: bash
 *
 *	echo "disable <block>" > /sys/kernel/debug/dri/<N>/ras/ras_ctrl
 *	echo "enable <block> <error>" > /sys/kernel/debug/dri/<N>/ras/ras_ctrl
 *	echo "inject <block> <error> <sub-block> <address> <value> <mask>" > /sys/kernel/debug/dri/<N>/ras/ras_ctrl
 *
 * Where N is the card which you want to affect.
 *
 * "disable" requires only the block.
 * "enable" requires the block and error type.
 * "inject" requires the block, error type, address, and value.
 *
 * The block is one of: umc, sdma, gfx, etc.;
 * see ras_block_string[] for details.
 *
 * The error type is one of: ue, ce and poison where,
 *	ue is multi-uncorrectable
 *	ce is single-correctable
 *	poison is poison
 *
 * The sub-block is the sub-block index; pass 0 if there is no sub-block.
 * The address and value are hexadecimal numbers, leading 0x is optional.
 * The mask is the instance mask; it is optional, default value is 0x1.
 *
 * For instance,
 *
 * ..
code-block:: bash 547 * 548 * echo inject umc ue 0x0 0x0 0x0 > /sys/kernel/debug/dri/0/ras/ras_ctrl 549 * echo inject umc ce 0 0 0 3 > /sys/kernel/debug/dri/0/ras/ras_ctrl 550 * echo disable umc > /sys/kernel/debug/dri/0/ras/ras_ctrl 551 * 552 * How to check the result of the operation? 553 * 554 * To check disable/enable, see "ras" features at, 555 * /sys/class/drm/card[0/1/2...]/device/ras/features 556 * 557 * To check inject, see the corresponding error count at, 558 * /sys/class/drm/card[0/1/2...]/device/ras/[gfx|sdma|umc|...]_err_count 559 * 560 * .. note:: 561 * Operations are only allowed on blocks which are supported. 562 * Check the "ras" mask at /sys/module/amdgpu/parameters/ras_mask 563 * to see which blocks support RAS on a particular asic. 564 * 565 */ 566 static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, 567 const char __user *buf, 568 size_t size, loff_t *pos) 569 { 570 struct amdgpu_device *adev = (struct amdgpu_device *)file_inode(f)->i_private; 571 struct ras_debug_if data; 572 int ret = 0; 573 574 if (!amdgpu_ras_get_error_query_ready(adev)) { 575 dev_warn(adev->dev, "RAS WARN: error injection " 576 "currently inaccessible\n"); 577 return size; 578 } 579 580 ret = amdgpu_ras_debugfs_ctrl_parse_data(f, buf, size, pos, &data); 581 if (ret) 582 return ret; 583 584 if (data.op == 3) { 585 ret = amdgpu_reserve_page_direct(adev, data.inject.address); 586 if (!ret) 587 return size; 588 else 589 return ret; 590 } else if (data.op == 4) { 591 ret = amdgpu_check_address_validity(adev, data.inject.address, data.inject.value); 592 return ret ? ret : size; 593 } 594 595 if (!amdgpu_ras_is_supported(adev, data.head.block)) 596 return -EINVAL; 597 598 switch (data.op) { 599 case 0: 600 ret = amdgpu_ras_feature_enable(adev, &data.head, 0); 601 break; 602 case 1: 603 ret = amdgpu_ras_feature_enable(adev, &data.head, 1); 604 break; 605 case 2: 606 /* umc ce/ue error injection for a bad page is not allowed */ 607 if (data.head.block == AMDGPU_RAS_BLOCK__UMC) 608 ret = amdgpu_ras_check_bad_page(adev, data.inject.address); 609 if (ret == -EINVAL) { 610 dev_warn(adev->dev, "RAS WARN: input address 0x%llx is invalid.", 611 data.inject.address); 612 break; 613 } else if (ret == 1) { 614 dev_warn(adev->dev, "RAS WARN: inject: 0x%llx has already been marked as bad!\n", 615 data.inject.address); 616 break; 617 } 618 619 amdgpu_ras_instance_mask_check(adev, &data); 620 621 /* data.inject.address is offset instead of absolute gpu address */ 622 ret = amdgpu_ras_error_inject(adev, &data.inject); 623 break; 624 default: 625 ret = -EINVAL; 626 break; 627 } 628 629 if (ret) 630 return ret; 631 632 return size; 633 } 634 635 static int amdgpu_uniras_clear_badpages_info(struct amdgpu_device *adev); 636 637 /** 638 * DOC: AMDGPU RAS debugfs EEPROM table reset interface 639 * 640 * Some boards contain an EEPROM which is used to persistently store a list of 641 * bad pages which experiences ECC errors in vram. This interface provides 642 * a way to reset the EEPROM, e.g., after testing error injection. 643 * 644 * Usage: 645 * 646 * .. code-block:: bash 647 * 648 * echo 1 > ../ras/ras_eeprom_reset 649 * 650 * will reset EEPROM table to 0 entries. 651 * 652 */ 653 static ssize_t amdgpu_ras_debugfs_eeprom_write(struct file *f, 654 const char __user *buf, 655 size_t size, loff_t *pos) 656 { 657 struct amdgpu_device *adev = 658 (struct amdgpu_device *)file_inode(f)->i_private; 659 int ret; 660 661 if (amdgpu_uniras_enabled(adev)) { 662 ret = amdgpu_uniras_clear_badpages_info(adev); 663 return ret ? 
ret : size; 664 } 665 666 ret = amdgpu_ras_eeprom_reset_table( 667 &(amdgpu_ras_get_context(adev)->eeprom_control)); 668 669 if (!ret) { 670 /* Something was written to EEPROM. 671 */ 672 amdgpu_ras_get_context(adev)->flags = RAS_DEFAULT_FLAGS; 673 return size; 674 } else { 675 return ret; 676 } 677 } 678 679 static const struct file_operations amdgpu_ras_debugfs_ctrl_ops = { 680 .owner = THIS_MODULE, 681 .read = NULL, 682 .write = amdgpu_ras_debugfs_ctrl_write, 683 .llseek = default_llseek 684 }; 685 686 static const struct file_operations amdgpu_ras_debugfs_eeprom_ops = { 687 .owner = THIS_MODULE, 688 .read = NULL, 689 .write = amdgpu_ras_debugfs_eeprom_write, 690 .llseek = default_llseek 691 }; 692 693 /** 694 * DOC: AMDGPU RAS sysfs Error Count Interface 695 * 696 * It allows the user to read the error count for each IP block on the gpu through 697 * /sys/class/drm/card[0/1/2...]/device/ras/[gfx/sdma/...]_err_count 698 * 699 * It outputs the multiple lines which report the uncorrected (ue) and corrected 700 * (ce) error counts. 701 * 702 * The format of one line is below, 703 * 704 * [ce|ue]: count 705 * 706 * Example: 707 * 708 * .. code-block:: bash 709 * 710 * ue: 0 711 * ce: 1 712 * 713 */ 714 static ssize_t amdgpu_ras_sysfs_read(struct device *dev, 715 struct device_attribute *attr, char *buf) 716 { 717 struct ras_manager *obj = container_of(attr, struct ras_manager, sysfs_attr); 718 struct ras_query_if info = { 719 .head = obj->head, 720 }; 721 722 if (!amdgpu_ras_get_error_query_ready(obj->adev)) 723 return sysfs_emit(buf, "Query currently inaccessible\n"); 724 725 if (amdgpu_ras_query_error_status(obj->adev, &info)) 726 return -EINVAL; 727 728 if (amdgpu_ip_version(obj->adev, MP0_HWIP, 0) != IP_VERSION(11, 0, 2) && 729 amdgpu_ip_version(obj->adev, MP0_HWIP, 0) != IP_VERSION(11, 0, 4)) { 730 if (amdgpu_ras_reset_error_status(obj->adev, info.head.block)) 731 dev_warn(obj->adev->dev, "Failed to reset error counter and error status"); 732 } 733 734 if (info.head.block == AMDGPU_RAS_BLOCK__UMC) 735 return sysfs_emit(buf, "%s: %lu\n%s: %lu\n%s: %lu\n", "ue", info.ue_count, 736 "ce", info.ce_count, "de", info.de_count); 737 else 738 return sysfs_emit(buf, "%s: %lu\n%s: %lu\n", "ue", info.ue_count, 739 "ce", info.ce_count); 740 } 741 742 /* obj begin */ 743 744 #define get_obj(obj) do { (obj)->use++; } while (0) 745 #define alive_obj(obj) ((obj)->use) 746 747 static inline void put_obj(struct ras_manager *obj) 748 { 749 if (obj && (--obj->use == 0)) { 750 list_del(&obj->node); 751 amdgpu_ras_error_data_fini(&obj->err_data); 752 } 753 754 if (obj && (obj->use < 0)) 755 DRM_ERROR("RAS ERROR: Unbalance obj(%s) use\n", get_ras_block_str(&obj->head)); 756 } 757 758 /* make one obj and return it. */ 759 static struct ras_manager *amdgpu_ras_create_obj(struct amdgpu_device *adev, 760 struct ras_common_if *head) 761 { 762 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 763 struct ras_manager *obj; 764 765 if (!adev->ras_enabled || !con) 766 return NULL; 767 768 if (head->block >= AMDGPU_RAS_BLOCK_COUNT) 769 return NULL; 770 771 if (head->block == AMDGPU_RAS_BLOCK__MCA) { 772 if (head->sub_block_index >= AMDGPU_RAS_MCA_BLOCK__LAST) 773 return NULL; 774 775 obj = &con->objs[AMDGPU_RAS_BLOCK__LAST + head->sub_block_index]; 776 } else 777 obj = &con->objs[head->block]; 778 779 /* already exist. return obj? 
*/ 780 if (alive_obj(obj)) 781 return NULL; 782 783 if (amdgpu_ras_error_data_init(&obj->err_data)) 784 return NULL; 785 786 obj->head = *head; 787 obj->adev = adev; 788 list_add(&obj->node, &con->head); 789 get_obj(obj); 790 791 return obj; 792 } 793 794 /* return an obj equal to head, or the first when head is NULL */ 795 struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev, 796 struct ras_common_if *head) 797 { 798 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 799 struct ras_manager *obj; 800 int i; 801 802 if (!adev->ras_enabled || !con) 803 return NULL; 804 805 if (head) { 806 if (head->block >= AMDGPU_RAS_BLOCK_COUNT) 807 return NULL; 808 809 if (head->block == AMDGPU_RAS_BLOCK__MCA) { 810 if (head->sub_block_index >= AMDGPU_RAS_MCA_BLOCK__LAST) 811 return NULL; 812 813 obj = &con->objs[AMDGPU_RAS_BLOCK__LAST + head->sub_block_index]; 814 } else 815 obj = &con->objs[head->block]; 816 817 if (alive_obj(obj)) 818 return obj; 819 } else { 820 for (i = 0; i < AMDGPU_RAS_BLOCK_COUNT + AMDGPU_RAS_MCA_BLOCK_COUNT; i++) { 821 obj = &con->objs[i]; 822 if (alive_obj(obj)) 823 return obj; 824 } 825 } 826 827 return NULL; 828 } 829 /* obj end */ 830 831 /* feature ctl begin */ 832 static int amdgpu_ras_is_feature_allowed(struct amdgpu_device *adev, 833 struct ras_common_if *head) 834 { 835 return adev->ras_hw_enabled & BIT(head->block); 836 } 837 838 static int amdgpu_ras_is_feature_enabled(struct amdgpu_device *adev, 839 struct ras_common_if *head) 840 { 841 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 842 843 return con->features & BIT(head->block); 844 } 845 846 /* 847 * if obj is not created, then create one. 848 * set feature enable flag. 849 */ 850 static int __amdgpu_ras_feature_enable(struct amdgpu_device *adev, 851 struct ras_common_if *head, int enable) 852 { 853 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 854 struct ras_manager *obj = amdgpu_ras_find_obj(adev, head); 855 856 /* If hardware does not support ras, then do not create obj. 857 * But if hardware support ras, we can create the obj. 858 * Ras framework checks con->hw_supported to see if it need do 859 * corresponding initialization. 860 * IP checks con->support to see if it need disable ras. 
	 */
	if (!amdgpu_ras_is_feature_allowed(adev, head))
		return 0;

	if (enable) {
		if (!obj) {
			obj = amdgpu_ras_create_obj(adev, head);
			if (!obj)
				return -EINVAL;
		} else {
			/* In case we create obj somewhere else */
			get_obj(obj);
		}
		con->features |= BIT(head->block);
	} else {
		if (obj && amdgpu_ras_is_feature_enabled(adev, head)) {
			con->features &= ~BIT(head->block);
			put_obj(obj);
		}
	}

	return 0;
}

/* wrapper of psp_ras_enable_features */
int amdgpu_ras_feature_enable(struct amdgpu_device *adev,
			      struct ras_common_if *head, bool enable)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	union ta_ras_cmd_input *info;
	int ret;

	if (!con)
		return -EINVAL;

	/* For non-gfx ip, do not enable ras feature if it is not allowed */
	/* For gfx ip, regardless of feature support status, */
	/* Force issue enable or disable ras feature commands */
	if (head->block != AMDGPU_RAS_BLOCK__GFX &&
	    !amdgpu_ras_is_feature_allowed(adev, head))
		return 0;

	/* Only enable gfx ras feature from host side */
	if (head->block == AMDGPU_RAS_BLOCK__GFX &&
	    !amdgpu_sriov_vf(adev) &&
	    !amdgpu_ras_intr_triggered()) {
		info = kzalloc(sizeof(union ta_ras_cmd_input), GFP_KERNEL);
		if (!info)
			return -ENOMEM;

		if (!enable) {
			info->disable_features = (struct ta_ras_disable_features_input) {
				.block_id = amdgpu_ras_block_to_ta(head->block),
				.error_type = amdgpu_ras_error_to_ta(head->type),
			};
		} else {
			info->enable_features = (struct ta_ras_enable_features_input) {
				.block_id = amdgpu_ras_block_to_ta(head->block),
				.error_type = amdgpu_ras_error_to_ta(head->type),
			};
		}

		ret = psp_ras_enable_features(&adev->psp, info, enable);
		if (ret) {
			dev_err(adev->dev, "ras %s %s failed poison:%d ret:%d\n",
				enable ? "enable":"disable",
				get_ras_block_str(head),
				amdgpu_ras_is_poison_mode_supported(adev), ret);
			kfree(info);
			return ret;
		}

		kfree(info);
	}

	/* setup the obj */
	__amdgpu_ras_feature_enable(adev, head, enable);

	return 0;
}

/* Only used in device probe stage and called only once. */
int amdgpu_ras_feature_enable_on_boot(struct amdgpu_device *adev,
				      struct ras_common_if *head, bool enable)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	int ret;

	if (!con)
		return -EINVAL;

	if (con->flags & AMDGPU_RAS_FLAG_INIT_BY_VBIOS) {
		if (enable) {
			/* There is no harm in issuing a ras TA cmd regardless of
			 * the current ras state.
			 * If current state == target state, it will do nothing.
			 * But sometimes it requests the driver to reset and repost
			 * with error code -EAGAIN.
			 */
			ret = amdgpu_ras_feature_enable(adev, head, 1);
			/* With old ras TA, we might fail to enable ras.
			 * Log it and just setup the object.
			 * TODO: remove this WA in the future.
964 */ 965 if (ret == -EINVAL) { 966 ret = __amdgpu_ras_feature_enable(adev, head, 1); 967 if (!ret) 968 dev_info(adev->dev, 969 "RAS INFO: %s setup object\n", 970 get_ras_block_str(head)); 971 } 972 } else { 973 /* setup the object then issue a ras TA disable cmd.*/ 974 ret = __amdgpu_ras_feature_enable(adev, head, 1); 975 if (ret) 976 return ret; 977 978 /* gfx block ras disable cmd must send to ras-ta */ 979 if (head->block == AMDGPU_RAS_BLOCK__GFX) 980 con->features |= BIT(head->block); 981 982 ret = amdgpu_ras_feature_enable(adev, head, 0); 983 984 /* clean gfx block ras features flag */ 985 if (adev->ras_enabled && head->block == AMDGPU_RAS_BLOCK__GFX) 986 con->features &= ~BIT(head->block); 987 } 988 } else 989 ret = amdgpu_ras_feature_enable(adev, head, enable); 990 991 return ret; 992 } 993 994 static int amdgpu_ras_disable_all_features(struct amdgpu_device *adev, 995 bool bypass) 996 { 997 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 998 struct ras_manager *obj, *tmp; 999 1000 list_for_each_entry_safe(obj, tmp, &con->head, node) { 1001 /* bypass psp. 1002 * aka just release the obj and corresponding flags 1003 */ 1004 if (bypass) { 1005 if (__amdgpu_ras_feature_enable(adev, &obj->head, 0)) 1006 break; 1007 } else { 1008 if (amdgpu_ras_feature_enable(adev, &obj->head, 0)) 1009 break; 1010 } 1011 } 1012 1013 return con->features; 1014 } 1015 1016 static int amdgpu_ras_enable_all_features(struct amdgpu_device *adev, 1017 bool bypass) 1018 { 1019 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 1020 int i; 1021 const enum amdgpu_ras_error_type default_ras_type = AMDGPU_RAS_ERROR__NONE; 1022 1023 for (i = 0; i < AMDGPU_RAS_BLOCK_COUNT; i++) { 1024 struct ras_common_if head = { 1025 .block = i, 1026 .type = default_ras_type, 1027 .sub_block_index = 0, 1028 }; 1029 1030 if (i == AMDGPU_RAS_BLOCK__MCA) 1031 continue; 1032 1033 if (bypass) { 1034 /* 1035 * bypass psp. vbios enable ras for us. 1036 * so just create the obj 1037 */ 1038 if (__amdgpu_ras_feature_enable(adev, &head, 1)) 1039 break; 1040 } else { 1041 if (amdgpu_ras_feature_enable(adev, &head, 1)) 1042 break; 1043 } 1044 } 1045 1046 for (i = 0; i < AMDGPU_RAS_MCA_BLOCK_COUNT; i++) { 1047 struct ras_common_if head = { 1048 .block = AMDGPU_RAS_BLOCK__MCA, 1049 .type = default_ras_type, 1050 .sub_block_index = i, 1051 }; 1052 1053 if (bypass) { 1054 /* 1055 * bypass psp. vbios enable ras for us. 
1056 * so just create the obj 1057 */ 1058 if (__amdgpu_ras_feature_enable(adev, &head, 1)) 1059 break; 1060 } else { 1061 if (amdgpu_ras_feature_enable(adev, &head, 1)) 1062 break; 1063 } 1064 } 1065 1066 return con->features; 1067 } 1068 /* feature ctl end */ 1069 1070 static int amdgpu_ras_block_match_default(struct amdgpu_ras_block_object *block_obj, 1071 enum amdgpu_ras_block block) 1072 { 1073 if (!block_obj) 1074 return -EINVAL; 1075 1076 if (block_obj->ras_comm.block == block) 1077 return 0; 1078 1079 return -EINVAL; 1080 } 1081 1082 static struct amdgpu_ras_block_object *amdgpu_ras_get_ras_block(struct amdgpu_device *adev, 1083 enum amdgpu_ras_block block, uint32_t sub_block_index) 1084 { 1085 struct amdgpu_ras_block_list *node, *tmp; 1086 struct amdgpu_ras_block_object *obj; 1087 1088 if (block >= AMDGPU_RAS_BLOCK__LAST) 1089 return NULL; 1090 1091 list_for_each_entry_safe(node, tmp, &adev->ras_list, node) { 1092 if (!node->ras_obj) { 1093 dev_warn(adev->dev, "Warning: abnormal ras list node.\n"); 1094 continue; 1095 } 1096 1097 obj = node->ras_obj; 1098 if (obj->ras_block_match) { 1099 if (obj->ras_block_match(obj, block, sub_block_index) == 0) 1100 return obj; 1101 } else { 1102 if (amdgpu_ras_block_match_default(obj, block) == 0) 1103 return obj; 1104 } 1105 } 1106 1107 return NULL; 1108 } 1109 1110 static void amdgpu_ras_get_ecc_info(struct amdgpu_device *adev, struct ras_err_data *err_data) 1111 { 1112 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 1113 int ret = 0; 1114 1115 /* 1116 * choosing right query method according to 1117 * whether smu support query error information 1118 */ 1119 ret = amdgpu_dpm_get_ecc_info(adev, (void *)&(ras->umc_ecc)); 1120 if (ret == -EOPNOTSUPP) { 1121 if (adev->umc.ras && adev->umc.ras->ras_block.hw_ops && 1122 adev->umc.ras->ras_block.hw_ops->query_ras_error_count) 1123 adev->umc.ras->ras_block.hw_ops->query_ras_error_count(adev, err_data); 1124 1125 /* umc query_ras_error_address is also responsible for clearing 1126 * error status 1127 */ 1128 if (adev->umc.ras && adev->umc.ras->ras_block.hw_ops && 1129 adev->umc.ras->ras_block.hw_ops->query_ras_error_address) 1130 adev->umc.ras->ras_block.hw_ops->query_ras_error_address(adev, err_data); 1131 } else if (!ret) { 1132 if (adev->umc.ras && 1133 adev->umc.ras->ecc_info_query_ras_error_count) 1134 adev->umc.ras->ecc_info_query_ras_error_count(adev, err_data); 1135 1136 if (adev->umc.ras && 1137 adev->umc.ras->ecc_info_query_ras_error_address) 1138 adev->umc.ras->ecc_info_query_ras_error_address(adev, err_data); 1139 } 1140 } 1141 1142 static void amdgpu_ras_error_print_error_data(struct amdgpu_device *adev, 1143 struct ras_manager *ras_mgr, 1144 struct ras_err_data *err_data, 1145 struct ras_query_context *qctx, 1146 const char *blk_name, 1147 bool is_ue, 1148 bool is_de) 1149 { 1150 struct amdgpu_smuio_mcm_config_info *mcm_info; 1151 struct ras_err_node *err_node; 1152 struct ras_err_info *err_info; 1153 u64 event_id = qctx->evid.event_id; 1154 1155 if (is_ue) { 1156 for_each_ras_error(err_node, err_data) { 1157 err_info = &err_node->err_info; 1158 mcm_info = &err_info->mcm_info; 1159 if (err_info->ue_count) { 1160 RAS_EVENT_LOG(adev, event_id, "socket: %d, die: %d, " 1161 "%lld new uncorrectable hardware errors detected in %s block\n", 1162 mcm_info->socket_id, 1163 mcm_info->die_id, 1164 err_info->ue_count, 1165 blk_name); 1166 } 1167 } 1168 1169 for_each_ras_error(err_node, &ras_mgr->err_data) { 1170 err_info = &err_node->err_info; 1171 mcm_info = &err_info->mcm_info; 1172 
RAS_EVENT_LOG(adev, event_id, "socket: %d, die: %d, " 1173 "%lld uncorrectable hardware errors detected in total in %s block\n", 1174 mcm_info->socket_id, mcm_info->die_id, err_info->ue_count, blk_name); 1175 } 1176 1177 } else { 1178 if (is_de) { 1179 for_each_ras_error(err_node, err_data) { 1180 err_info = &err_node->err_info; 1181 mcm_info = &err_info->mcm_info; 1182 if (err_info->de_count) { 1183 RAS_EVENT_LOG(adev, event_id, "socket: %d, die: %d, " 1184 "%lld new deferred hardware errors detected in %s block\n", 1185 mcm_info->socket_id, 1186 mcm_info->die_id, 1187 err_info->de_count, 1188 blk_name); 1189 } 1190 } 1191 1192 for_each_ras_error(err_node, &ras_mgr->err_data) { 1193 err_info = &err_node->err_info; 1194 mcm_info = &err_info->mcm_info; 1195 RAS_EVENT_LOG(adev, event_id, "socket: %d, die: %d, " 1196 "%lld deferred hardware errors detected in total in %s block\n", 1197 mcm_info->socket_id, mcm_info->die_id, 1198 err_info->de_count, blk_name); 1199 } 1200 } else { 1201 if (adev->debug_disable_ce_logs) 1202 return; 1203 1204 for_each_ras_error(err_node, err_data) { 1205 err_info = &err_node->err_info; 1206 mcm_info = &err_info->mcm_info; 1207 if (err_info->ce_count) { 1208 RAS_EVENT_LOG(adev, event_id, "socket: %d, die: %d, " 1209 "%lld new correctable hardware errors detected in %s block\n", 1210 mcm_info->socket_id, 1211 mcm_info->die_id, 1212 err_info->ce_count, 1213 blk_name); 1214 } 1215 } 1216 1217 for_each_ras_error(err_node, &ras_mgr->err_data) { 1218 err_info = &err_node->err_info; 1219 mcm_info = &err_info->mcm_info; 1220 RAS_EVENT_LOG(adev, event_id, "socket: %d, die: %d, " 1221 "%lld correctable hardware errors detected in total in %s block\n", 1222 mcm_info->socket_id, mcm_info->die_id, 1223 err_info->ce_count, blk_name); 1224 } 1225 } 1226 } 1227 } 1228 1229 static inline bool err_data_has_source_info(struct ras_err_data *data) 1230 { 1231 return !list_empty(&data->err_node_list); 1232 } 1233 1234 static void amdgpu_ras_error_generate_report(struct amdgpu_device *adev, 1235 struct ras_query_if *query_if, 1236 struct ras_err_data *err_data, 1237 struct ras_query_context *qctx) 1238 { 1239 struct ras_manager *ras_mgr = amdgpu_ras_find_obj(adev, &query_if->head); 1240 const char *blk_name = get_ras_block_str(&query_if->head); 1241 u64 event_id = qctx->evid.event_id; 1242 1243 if (err_data->ce_count) { 1244 if (err_data_has_source_info(err_data)) { 1245 amdgpu_ras_error_print_error_data(adev, ras_mgr, err_data, qctx, 1246 blk_name, false, false); 1247 } else if (!adev->aid_mask && 1248 adev->smuio.funcs && 1249 adev->smuio.funcs->get_socket_id && 1250 adev->smuio.funcs->get_die_id) { 1251 RAS_EVENT_LOG(adev, event_id, "socket: %d, die: %d " 1252 "%ld correctable hardware errors " 1253 "detected in %s block\n", 1254 adev->smuio.funcs->get_socket_id(adev), 1255 adev->smuio.funcs->get_die_id(adev), 1256 ras_mgr->err_data.ce_count, 1257 blk_name); 1258 } else { 1259 RAS_EVENT_LOG(adev, event_id, "%ld correctable hardware errors " 1260 "detected in %s block\n", 1261 ras_mgr->err_data.ce_count, 1262 blk_name); 1263 } 1264 } 1265 1266 if (err_data->ue_count) { 1267 if (err_data_has_source_info(err_data)) { 1268 amdgpu_ras_error_print_error_data(adev, ras_mgr, err_data, qctx, 1269 blk_name, true, false); 1270 } else if (!adev->aid_mask && 1271 adev->smuio.funcs && 1272 adev->smuio.funcs->get_socket_id && 1273 adev->smuio.funcs->get_die_id) { 1274 RAS_EVENT_LOG(adev, event_id, "socket: %d, die: %d " 1275 "%ld uncorrectable hardware errors " 1276 "detected in %s block\n", 1277 
adev->smuio.funcs->get_socket_id(adev), 1278 adev->smuio.funcs->get_die_id(adev), 1279 ras_mgr->err_data.ue_count, 1280 blk_name); 1281 } else { 1282 RAS_EVENT_LOG(adev, event_id, "%ld uncorrectable hardware errors " 1283 "detected in %s block\n", 1284 ras_mgr->err_data.ue_count, 1285 blk_name); 1286 } 1287 } 1288 1289 if (err_data->de_count) { 1290 if (err_data_has_source_info(err_data)) { 1291 amdgpu_ras_error_print_error_data(adev, ras_mgr, err_data, qctx, 1292 blk_name, false, true); 1293 } else if (!adev->aid_mask && 1294 adev->smuio.funcs && 1295 adev->smuio.funcs->get_socket_id && 1296 adev->smuio.funcs->get_die_id) { 1297 RAS_EVENT_LOG(adev, event_id, "socket: %d, die: %d " 1298 "%ld deferred hardware errors " 1299 "detected in %s block\n", 1300 adev->smuio.funcs->get_socket_id(adev), 1301 adev->smuio.funcs->get_die_id(adev), 1302 ras_mgr->err_data.de_count, 1303 blk_name); 1304 } else { 1305 RAS_EVENT_LOG(adev, event_id, "%ld deferred hardware errors " 1306 "detected in %s block\n", 1307 ras_mgr->err_data.de_count, 1308 blk_name); 1309 } 1310 } 1311 } 1312 1313 static void amdgpu_ras_virt_error_generate_report(struct amdgpu_device *adev, 1314 struct ras_query_if *query_if, 1315 struct ras_err_data *err_data, 1316 struct ras_query_context *qctx) 1317 { 1318 unsigned long new_ue, new_ce, new_de; 1319 struct ras_manager *obj = amdgpu_ras_find_obj(adev, &query_if->head); 1320 const char *blk_name = get_ras_block_str(&query_if->head); 1321 u64 event_id = qctx->evid.event_id; 1322 1323 new_ce = err_data->ce_count - obj->err_data.ce_count; 1324 new_ue = err_data->ue_count - obj->err_data.ue_count; 1325 new_de = err_data->de_count - obj->err_data.de_count; 1326 1327 if (new_ce) { 1328 RAS_EVENT_LOG(adev, event_id, "%lu correctable hardware errors " 1329 "detected in %s block\n", 1330 new_ce, 1331 blk_name); 1332 } 1333 1334 if (new_ue) { 1335 RAS_EVENT_LOG(adev, event_id, "%lu uncorrectable hardware errors " 1336 "detected in %s block\n", 1337 new_ue, 1338 blk_name); 1339 } 1340 1341 if (new_de) { 1342 RAS_EVENT_LOG(adev, event_id, "%lu deferred hardware errors " 1343 "detected in %s block\n", 1344 new_de, 1345 blk_name); 1346 } 1347 } 1348 1349 static void amdgpu_rasmgr_error_data_statistic_update(struct ras_manager *obj, struct ras_err_data *err_data) 1350 { 1351 struct ras_err_node *err_node; 1352 struct ras_err_info *err_info; 1353 1354 if (err_data_has_source_info(err_data)) { 1355 for_each_ras_error(err_node, err_data) { 1356 err_info = &err_node->err_info; 1357 amdgpu_ras_error_statistic_de_count(&obj->err_data, 1358 &err_info->mcm_info, err_info->de_count); 1359 amdgpu_ras_error_statistic_ce_count(&obj->err_data, 1360 &err_info->mcm_info, err_info->ce_count); 1361 amdgpu_ras_error_statistic_ue_count(&obj->err_data, 1362 &err_info->mcm_info, err_info->ue_count); 1363 } 1364 } else { 1365 /* for legacy asic path which doesn't has error source info */ 1366 obj->err_data.ue_count += err_data->ue_count; 1367 obj->err_data.ce_count += err_data->ce_count; 1368 obj->err_data.de_count += err_data->de_count; 1369 } 1370 } 1371 1372 static void amdgpu_ras_mgr_virt_error_data_statistics_update(struct ras_manager *obj, 1373 struct ras_err_data *err_data) 1374 { 1375 /* Host reports absolute counts */ 1376 obj->err_data.ue_count = err_data->ue_count; 1377 obj->err_data.ce_count = err_data->ce_count; 1378 obj->err_data.de_count = err_data->de_count; 1379 } 1380 1381 static struct ras_manager *get_ras_manager(struct amdgpu_device *adev, enum amdgpu_ras_block blk) 1382 { 1383 struct ras_common_if 
head; 1384 1385 memset(&head, 0, sizeof(head)); 1386 head.block = blk; 1387 1388 return amdgpu_ras_find_obj(adev, &head); 1389 } 1390 1391 int amdgpu_ras_bind_aca(struct amdgpu_device *adev, enum amdgpu_ras_block blk, 1392 const struct aca_info *aca_info, void *data) 1393 { 1394 struct ras_manager *obj; 1395 1396 /* in resume phase, no need to create aca fs node */ 1397 if (adev->in_suspend || amdgpu_reset_in_recovery(adev)) 1398 return 0; 1399 1400 obj = get_ras_manager(adev, blk); 1401 if (!obj) 1402 return -EINVAL; 1403 1404 return amdgpu_aca_add_handle(adev, &obj->aca_handle, ras_block_str(blk), aca_info, data); 1405 } 1406 1407 int amdgpu_ras_unbind_aca(struct amdgpu_device *adev, enum amdgpu_ras_block blk) 1408 { 1409 struct ras_manager *obj; 1410 1411 obj = get_ras_manager(adev, blk); 1412 if (!obj) 1413 return -EINVAL; 1414 1415 amdgpu_aca_remove_handle(&obj->aca_handle); 1416 1417 return 0; 1418 } 1419 1420 static int amdgpu_aca_log_ras_error_data(struct amdgpu_device *adev, enum amdgpu_ras_block blk, 1421 enum aca_error_type type, struct ras_err_data *err_data, 1422 struct ras_query_context *qctx) 1423 { 1424 struct ras_manager *obj; 1425 1426 obj = get_ras_manager(adev, blk); 1427 if (!obj) 1428 return -EINVAL; 1429 1430 return amdgpu_aca_get_error_data(adev, &obj->aca_handle, type, err_data, qctx); 1431 } 1432 1433 ssize_t amdgpu_ras_aca_sysfs_read(struct device *dev, struct device_attribute *attr, 1434 struct aca_handle *handle, char *buf, void *data) 1435 { 1436 struct ras_manager *obj = container_of(handle, struct ras_manager, aca_handle); 1437 struct ras_query_if info = { 1438 .head = obj->head, 1439 }; 1440 1441 if (!amdgpu_ras_get_error_query_ready(obj->adev)) 1442 return sysfs_emit(buf, "Query currently inaccessible\n"); 1443 1444 if (amdgpu_ras_query_error_status(obj->adev, &info)) 1445 return -EINVAL; 1446 1447 return sysfs_emit(buf, "%s: %lu\n%s: %lu\n%s: %lu\n", "ue", info.ue_count, 1448 "ce", info.ce_count, "de", info.de_count); 1449 } 1450 1451 static int amdgpu_ras_query_error_status_helper(struct amdgpu_device *adev, 1452 struct ras_query_if *info, 1453 struct ras_err_data *err_data, 1454 struct ras_query_context *qctx, 1455 unsigned int error_query_mode) 1456 { 1457 enum amdgpu_ras_block blk = info ? 
info->head.block : AMDGPU_RAS_BLOCK_COUNT; 1458 struct amdgpu_ras_block_object *block_obj = NULL; 1459 int ret; 1460 1461 if (blk == AMDGPU_RAS_BLOCK_COUNT) 1462 return -EINVAL; 1463 1464 if (error_query_mode == AMDGPU_RAS_INVALID_ERROR_QUERY) 1465 return -EINVAL; 1466 1467 if (error_query_mode == AMDGPU_RAS_VIRT_ERROR_COUNT_QUERY) { 1468 return amdgpu_virt_req_ras_err_count(adev, blk, err_data); 1469 } else if (error_query_mode == AMDGPU_RAS_DIRECT_ERROR_QUERY) { 1470 if (info->head.block == AMDGPU_RAS_BLOCK__UMC) { 1471 amdgpu_ras_get_ecc_info(adev, err_data); 1472 } else { 1473 block_obj = amdgpu_ras_get_ras_block(adev, info->head.block, 0); 1474 if (!block_obj || !block_obj->hw_ops) { 1475 dev_dbg_once(adev->dev, "%s doesn't config RAS function\n", 1476 get_ras_block_str(&info->head)); 1477 return -EINVAL; 1478 } 1479 1480 if (block_obj->hw_ops->query_ras_error_count) 1481 block_obj->hw_ops->query_ras_error_count(adev, err_data); 1482 1483 if ((info->head.block == AMDGPU_RAS_BLOCK__SDMA) || 1484 (info->head.block == AMDGPU_RAS_BLOCK__GFX) || 1485 (info->head.block == AMDGPU_RAS_BLOCK__MMHUB)) { 1486 if (block_obj->hw_ops->query_ras_error_status) 1487 block_obj->hw_ops->query_ras_error_status(adev); 1488 } 1489 } 1490 } else { 1491 if (amdgpu_aca_is_enabled(adev)) { 1492 ret = amdgpu_aca_log_ras_error_data(adev, blk, ACA_ERROR_TYPE_UE, err_data, qctx); 1493 if (ret) 1494 return ret; 1495 1496 ret = amdgpu_aca_log_ras_error_data(adev, blk, ACA_ERROR_TYPE_CE, err_data, qctx); 1497 if (ret) 1498 return ret; 1499 1500 ret = amdgpu_aca_log_ras_error_data(adev, blk, ACA_ERROR_TYPE_DEFERRED, err_data, qctx); 1501 if (ret) 1502 return ret; 1503 } else { 1504 /* FIXME: add code to check return value later */ 1505 amdgpu_mca_smu_log_ras_error(adev, blk, AMDGPU_MCA_ERROR_TYPE_UE, err_data, qctx); 1506 amdgpu_mca_smu_log_ras_error(adev, blk, AMDGPU_MCA_ERROR_TYPE_CE, err_data, qctx); 1507 } 1508 } 1509 1510 return 0; 1511 } 1512 1513 /* query/inject/cure begin */ 1514 static int amdgpu_ras_query_error_status_with_event(struct amdgpu_device *adev, 1515 struct ras_query_if *info, 1516 enum ras_event_type type) 1517 { 1518 struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head); 1519 struct ras_err_data err_data; 1520 struct ras_query_context qctx; 1521 unsigned int error_query_mode; 1522 int ret; 1523 1524 if (!obj) 1525 return -EINVAL; 1526 1527 ret = amdgpu_ras_error_data_init(&err_data); 1528 if (ret) 1529 return ret; 1530 1531 if (!amdgpu_ras_get_error_query_mode(adev, &error_query_mode)) 1532 return -EINVAL; 1533 1534 memset(&qctx, 0, sizeof(qctx)); 1535 qctx.evid.type = type; 1536 qctx.evid.event_id = amdgpu_ras_acquire_event_id(adev, type); 1537 1538 if (!down_read_trylock(&adev->reset_domain->sem)) { 1539 ret = -EIO; 1540 goto out_fini_err_data; 1541 } 1542 1543 ret = amdgpu_ras_query_error_status_helper(adev, info, 1544 &err_data, 1545 &qctx, 1546 error_query_mode); 1547 up_read(&adev->reset_domain->sem); 1548 if (ret) 1549 goto out_fini_err_data; 1550 1551 if (error_query_mode != AMDGPU_RAS_VIRT_ERROR_COUNT_QUERY) { 1552 amdgpu_rasmgr_error_data_statistic_update(obj, &err_data); 1553 amdgpu_ras_error_generate_report(adev, info, &err_data, &qctx); 1554 } else { 1555 /* Host provides absolute error counts. First generate the report 1556 * using the previous VF internal count against new host count. 1557 * Then Update VF internal count. 
1558 */ 1559 amdgpu_ras_virt_error_generate_report(adev, info, &err_data, &qctx); 1560 amdgpu_ras_mgr_virt_error_data_statistics_update(obj, &err_data); 1561 } 1562 1563 info->ue_count = obj->err_data.ue_count; 1564 info->ce_count = obj->err_data.ce_count; 1565 info->de_count = obj->err_data.de_count; 1566 1567 out_fini_err_data: 1568 amdgpu_ras_error_data_fini(&err_data); 1569 1570 return ret; 1571 } 1572 1573 static int amdgpu_uniras_clear_badpages_info(struct amdgpu_device *adev) 1574 { 1575 struct ras_cmd_dev_handle req = {0}; 1576 int ret; 1577 1578 ret = amdgpu_ras_mgr_handle_ras_cmd(adev, RAS_CMD__CLEAR_BAD_PAGE_INFO, 1579 &req, sizeof(req), NULL, 0); 1580 if (ret) { 1581 dev_err(adev->dev, "Failed to clear bad pages info, ret: %d\n", ret); 1582 return ret; 1583 } 1584 1585 return 0; 1586 } 1587 1588 static int amdgpu_uniras_query_block_ecc(struct amdgpu_device *adev, 1589 struct ras_query_if *info) 1590 { 1591 struct ras_cmd_block_ecc_info_req req = {0}; 1592 struct ras_cmd_block_ecc_info_rsp rsp = {0}; 1593 int ret; 1594 1595 if (!info) 1596 return -EINVAL; 1597 1598 req.block_id = info->head.block; 1599 req.subblock_id = info->head.sub_block_index; 1600 1601 ret = amdgpu_ras_mgr_handle_ras_cmd(adev, RAS_CMD__GET_BLOCK_ECC_STATUS, 1602 &req, sizeof(req), &rsp, sizeof(rsp)); 1603 if (!ret) { 1604 info->ce_count = rsp.ce_count; 1605 info->ue_count = rsp.ue_count; 1606 info->de_count = rsp.de_count; 1607 } 1608 1609 return ret; 1610 } 1611 1612 int amdgpu_ras_query_error_status(struct amdgpu_device *adev, struct ras_query_if *info) 1613 { 1614 if (amdgpu_uniras_enabled(adev)) 1615 return amdgpu_uniras_query_block_ecc(adev, info); 1616 else 1617 return amdgpu_ras_query_error_status_with_event(adev, info, RAS_EVENT_TYPE_INVALID); 1618 } 1619 1620 int amdgpu_ras_reset_error_count(struct amdgpu_device *adev, 1621 enum amdgpu_ras_block block) 1622 { 1623 struct amdgpu_ras_block_object *block_obj = amdgpu_ras_get_ras_block(adev, block, 0); 1624 const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs; 1625 const struct aca_smu_funcs *smu_funcs = adev->aca.smu_funcs; 1626 1627 if (!block_obj || !block_obj->hw_ops) { 1628 dev_dbg_once(adev->dev, "%s doesn't config RAS function\n", 1629 ras_block_str(block)); 1630 return -EOPNOTSUPP; 1631 } 1632 1633 if (!amdgpu_ras_is_supported(adev, block) || 1634 !amdgpu_ras_get_aca_debug_mode(adev)) 1635 return -EOPNOTSUPP; 1636 1637 if (amdgpu_sriov_vf(adev)) 1638 return -EOPNOTSUPP; 1639 1640 /* skip ras error reset in gpu reset */ 1641 if ((amdgpu_in_reset(adev) || amdgpu_ras_in_recovery(adev)) && 1642 ((smu_funcs && smu_funcs->set_debug_mode) || 1643 (mca_funcs && mca_funcs->mca_set_debug_mode))) 1644 return -EOPNOTSUPP; 1645 1646 if (block_obj->hw_ops->reset_ras_error_count) 1647 block_obj->hw_ops->reset_ras_error_count(adev); 1648 1649 return 0; 1650 } 1651 1652 int amdgpu_ras_reset_error_status(struct amdgpu_device *adev, 1653 enum amdgpu_ras_block block) 1654 { 1655 struct amdgpu_ras_block_object *block_obj = amdgpu_ras_get_ras_block(adev, block, 0); 1656 1657 if (amdgpu_ras_reset_error_count(adev, block) == -EOPNOTSUPP) 1658 return 0; 1659 1660 if ((block == AMDGPU_RAS_BLOCK__GFX) || 1661 (block == AMDGPU_RAS_BLOCK__MMHUB)) { 1662 if (block_obj->hw_ops->reset_ras_error_status) 1663 block_obj->hw_ops->reset_ras_error_status(adev); 1664 } 1665 1666 return 0; 1667 } 1668 1669 static int amdgpu_uniras_error_inject(struct amdgpu_device *adev, 1670 struct ras_inject_if *info) 1671 { 1672 struct ras_cmd_inject_error_req inject_req; 1673 struct 
ras_cmd_inject_error_rsp rsp; 1674 1675 if (!info) 1676 return -EINVAL; 1677 1678 memset(&inject_req, 0, sizeof(inject_req)); 1679 inject_req.block_id = info->head.block; 1680 inject_req.subblock_id = info->head.sub_block_index; 1681 inject_req.address = info->address; 1682 inject_req.error_type = info->head.type; 1683 inject_req.instance_mask = info->instance_mask; 1684 inject_req.method = info->value; 1685 1686 return amdgpu_ras_mgr_handle_ras_cmd(adev, RAS_CMD__INJECT_ERROR, 1687 &inject_req, sizeof(inject_req), &rsp, sizeof(rsp)); 1688 } 1689 1690 /* wrapper of psp_ras_trigger_error */ 1691 int amdgpu_ras_error_inject(struct amdgpu_device *adev, 1692 struct ras_inject_if *info) 1693 { 1694 struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head); 1695 struct ta_ras_trigger_error_input block_info = { 1696 .block_id = amdgpu_ras_block_to_ta(info->head.block), 1697 .inject_error_type = amdgpu_ras_error_to_ta(info->head.type), 1698 .sub_block_index = info->head.sub_block_index, 1699 .address = info->address, 1700 .value = info->value, 1701 }; 1702 int ret = -EINVAL; 1703 struct amdgpu_ras_block_object *block_obj = amdgpu_ras_get_ras_block(adev, 1704 info->head.block, 1705 info->head.sub_block_index); 1706 1707 if (amdgpu_uniras_enabled(adev)) 1708 return amdgpu_uniras_error_inject(adev, info); 1709 1710 /* inject on guest isn't allowed, return success directly */ 1711 if (amdgpu_sriov_vf(adev)) 1712 return 0; 1713 1714 if (!obj) 1715 return -EINVAL; 1716 1717 if (!block_obj || !block_obj->hw_ops) { 1718 dev_dbg_once(adev->dev, "%s doesn't config RAS function\n", 1719 get_ras_block_str(&info->head)); 1720 return -EINVAL; 1721 } 1722 1723 /* Calculate XGMI relative offset */ 1724 if (adev->gmc.xgmi.num_physical_nodes > 1 && 1725 info->head.block != AMDGPU_RAS_BLOCK__GFX) { 1726 block_info.address = 1727 amdgpu_xgmi_get_relative_phy_addr(adev, 1728 block_info.address); 1729 } 1730 1731 if (block_obj->hw_ops->ras_error_inject) { 1732 if (info->head.block == AMDGPU_RAS_BLOCK__GFX) 1733 ret = block_obj->hw_ops->ras_error_inject(adev, info, info->instance_mask); 1734 else /* Special ras_error_inject is defined (e.g: xgmi) */ 1735 ret = block_obj->hw_ops->ras_error_inject(adev, &block_info, 1736 info->instance_mask); 1737 } else { 1738 /* default path */ 1739 ret = psp_ras_trigger_error(&adev->psp, &block_info, info->instance_mask); 1740 } 1741 1742 if (ret) 1743 dev_err(adev->dev, "ras inject %s failed %d\n", 1744 get_ras_block_str(&info->head), ret); 1745 1746 return ret; 1747 } 1748 1749 /** 1750 * amdgpu_ras_query_error_count_helper -- Get error counter for specific IP 1751 * @adev: pointer to AMD GPU device 1752 * @ce_count: pointer to an integer to be set to the count of correctible errors. 1753 * @ue_count: pointer to an integer to be set to the count of uncorrectible errors. 
 * @query_info: pointer to ras_query_if
 *
 * Return 0 for query success or when there is nothing to do, otherwise
 * return an error on failure.
 */
static int amdgpu_ras_query_error_count_helper(struct amdgpu_device *adev,
					       unsigned long *ce_count,
					       unsigned long *ue_count,
					       struct ras_query_if *query_info)
{
	int ret;

	if (!query_info)
		/* do nothing if query_info is not specified */
		return 0;

	ret = amdgpu_ras_query_error_status(adev, query_info);
	if (ret)
		return ret;

	*ce_count += query_info->ce_count;
	*ue_count += query_info->ue_count;

	/* some hardware/IP supports read to clear,
	 * no need to explicitly reset the err status after the query call */
	if (amdgpu_ip_version(adev, MP0_HWIP, 0) != IP_VERSION(11, 0, 2) &&
	    amdgpu_ip_version(adev, MP0_HWIP, 0) != IP_VERSION(11, 0, 4)) {
		if (amdgpu_ras_reset_error_status(adev, query_info->head.block))
			dev_warn(adev->dev,
				 "Failed to reset error counter and error status\n");
	}

	return 0;
}

/**
 * amdgpu_ras_query_error_count -- Get error counts of all IPs or specific IP
 * @adev: pointer to AMD GPU device
 * @ce_count: pointer to an integer to be set to the count of correctable errors.
 * @ue_count: pointer to an integer to be set to the count of uncorrectable
 * errors.
 * @query_info: pointer to ras_query_if if the query request is only for
 * specific ip block; if info is NULL, then the query request is for
 * all the ip blocks that support query ras error counters/status
 *
 * If @ce_count or @ue_count is set, count and return the corresponding
 * error counts in those integer pointers. Return 0 if the device
 * supports RAS. Return -EOPNOTSUPP if the device doesn't support RAS.
 */
int amdgpu_ras_query_error_count(struct amdgpu_device *adev,
				 unsigned long *ce_count,
				 unsigned long *ue_count,
				 struct ras_query_if *query_info)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj;
	unsigned long ce, ue;
	int ret;

	if (!adev->ras_enabled || !con)
		return -EOPNOTSUPP;

	/* Don't count since no reporting.
1817 */ 1818 if (!ce_count && !ue_count) 1819 return 0; 1820 1821 ce = 0; 1822 ue = 0; 1823 if (!query_info) { 1824 /* query all the ip blocks that support ras query interface */ 1825 list_for_each_entry(obj, &con->head, node) { 1826 struct ras_query_if info = { 1827 .head = obj->head, 1828 }; 1829 1830 ret = amdgpu_ras_query_error_count_helper(adev, &ce, &ue, &info); 1831 } 1832 } else { 1833 /* query specific ip block */ 1834 ret = amdgpu_ras_query_error_count_helper(adev, &ce, &ue, query_info); 1835 } 1836 1837 if (ret) 1838 return ret; 1839 1840 if (ce_count) 1841 *ce_count = ce; 1842 1843 if (ue_count) 1844 *ue_count = ue; 1845 1846 return 0; 1847 } 1848 /* query/inject/cure end */ 1849 1850 1851 /* sysfs begin */ 1852 1853 static int amdgpu_ras_badpages_read(struct amdgpu_device *adev, 1854 struct ras_badpage *bps, uint32_t count, uint32_t start); 1855 static int amdgpu_uniras_badpages_read(struct amdgpu_device *adev, 1856 struct ras_badpage *bps, uint32_t count, uint32_t start); 1857 1858 static char *amdgpu_ras_badpage_flags_str(unsigned int flags) 1859 { 1860 switch (flags) { 1861 case AMDGPU_RAS_RETIRE_PAGE_RESERVED: 1862 return "R"; 1863 case AMDGPU_RAS_RETIRE_PAGE_PENDING: 1864 return "P"; 1865 case AMDGPU_RAS_RETIRE_PAGE_FAULT: 1866 default: 1867 return "F"; 1868 } 1869 } 1870 1871 /** 1872 * DOC: AMDGPU RAS sysfs gpu_vram_bad_pages Interface 1873 * 1874 * It allows user to read the bad pages of vram on the gpu through 1875 * /sys/class/drm/card[0/1/2...]/device/ras/gpu_vram_bad_pages 1876 * 1877 * It outputs multiple lines, and each line stands for one gpu page. 1878 * 1879 * The format of one line is below, 1880 * gpu pfn : gpu page size : flags 1881 * 1882 * gpu pfn and gpu page size are printed in hex format. 1883 * flags can be one of below character, 1884 * 1885 * R: reserved, this gpu page is reserved and not able to use. 1886 * 1887 * P: pending for reserve, this gpu page is marked as bad, will be reserved 1888 * in next window of page_reserve. 1889 * 1890 * F: unable to reserve. this gpu page can't be reserved due to some reasons. 1891 * 1892 * Examples: 1893 * 1894 * .. 
code-block:: bash 1895 * 1896 * 0x00000001 : 0x00001000 : R 1897 * 0x00000002 : 0x00001000 : P 1898 * 1899 */ 1900 1901 static ssize_t amdgpu_ras_sysfs_badpages_read(struct file *f, 1902 struct kobject *kobj, const struct bin_attribute *attr, 1903 char *buf, loff_t ppos, size_t count) 1904 { 1905 struct amdgpu_ras *con = 1906 container_of(attr, struct amdgpu_ras, badpages_attr); 1907 struct amdgpu_device *adev = con->adev; 1908 const unsigned int element_size = 1909 sizeof("0xabcdabcd : 0x12345678 : R\n") - 1; 1910 unsigned int start = div64_ul(ppos + element_size - 1, element_size); 1911 unsigned int end = div64_ul(ppos + count - 1, element_size); 1912 ssize_t s = 0; 1913 struct ras_badpage *bps = NULL; 1914 int bps_count = 0, i, status; 1915 uint64_t address; 1916 1917 memset(buf, 0, count); 1918 1919 bps_count = end - start; 1920 bps = kmalloc_objs(*bps, bps_count); 1921 if (!bps) 1922 return 0; 1923 1924 memset(bps, 0, sizeof(*bps) * bps_count); 1925 1926 if (amdgpu_uniras_enabled(adev)) 1927 bps_count = amdgpu_uniras_badpages_read(adev, bps, bps_count, start); 1928 else 1929 bps_count = amdgpu_ras_badpages_read(adev, bps, bps_count, start); 1930 1931 if (bps_count <= 0) { 1932 kfree(bps); 1933 return 0; 1934 } 1935 1936 for (i = 0; i < bps_count; i++) { 1937 address = ((uint64_t)bps[i].bp) << AMDGPU_GPU_PAGE_SHIFT; 1938 1939 bps[i].size = AMDGPU_GPU_PAGE_SIZE; 1940 1941 status = amdgpu_vram_mgr_query_page_status(&adev->mman.vram_mgr, 1942 address); 1943 if (status == -EBUSY) 1944 bps[i].flags = AMDGPU_RAS_RETIRE_PAGE_PENDING; 1945 else if (status == -ENOENT) 1946 bps[i].flags = AMDGPU_RAS_RETIRE_PAGE_FAULT; 1947 else 1948 bps[i].flags = AMDGPU_RAS_RETIRE_PAGE_RESERVED; 1949 1950 if ((bps[i].flags != AMDGPU_RAS_RETIRE_PAGE_RESERVED) && 1951 amdgpu_ras_check_critical_address(adev, address)) 1952 bps[i].flags = AMDGPU_RAS_RETIRE_PAGE_RESERVED; 1953 1954 s += scnprintf(&buf[s], element_size + 1, 1955 "0x%08x : 0x%08x : %1s\n", 1956 bps[i].bp, 1957 bps[i].size, 1958 amdgpu_ras_badpage_flags_str(bps[i].flags)); 1959 } 1960 1961 kfree(bps); 1962 1963 return s; 1964 } 1965 1966 static ssize_t amdgpu_ras_sysfs_features_read(struct device *dev, 1967 struct device_attribute *attr, char *buf) 1968 { 1969 struct amdgpu_ras *con = 1970 container_of(attr, struct amdgpu_ras, features_attr); 1971 1972 return sysfs_emit(buf, "feature mask: 0x%x\n", con->features); 1973 } 1974 1975 static bool amdgpu_ras_get_version_info(struct amdgpu_device *adev, u32 *major, 1976 u32 *minor, u32 *rev) 1977 { 1978 int i; 1979 1980 if (!adev || !major || !minor || !rev || !amdgpu_uniras_enabled(adev)) 1981 return false; 1982 1983 for (i = 0; i < adev->num_ip_blocks; i++) { 1984 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_RAS) { 1985 *major = adev->ip_blocks[i].version->major; 1986 *minor = adev->ip_blocks[i].version->minor; 1987 *rev = adev->ip_blocks[i].version->rev; 1988 return true; 1989 } 1990 } 1991 1992 return false; 1993 } 1994 1995 static ssize_t amdgpu_ras_sysfs_version_show(struct device *dev, 1996 struct device_attribute *attr, char *buf) 1997 { 1998 struct amdgpu_ras *con = 1999 container_of(attr, struct amdgpu_ras, version_attr); 2000 u32 major, minor, rev; 2001 ssize_t size = 0; 2002 2003 size += sysfs_emit_at(buf, size, "table version: 0x%x\n", 2004 con->eeprom_control.tbl_hdr.version); 2005 2006 if (amdgpu_ras_get_version_info(con->adev, &major, &minor, &rev)) 2007 size += sysfs_emit_at(buf, size, "ras version: %u.%u.%u\n", 2008 major, minor, rev); 2009 2010 return size; 2011 } 2012 2013 
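/*
 * Example (illustrative only, not part of the driver): a minimal userspace
 * reader for the gpu_vram_bad_pages format produced above. The card0 path
 * and the parsing loop are assumptions of this sketch; each record is a
 * fixed-width "0x%08x : 0x%08x : %1s" line.
 *
 * .. code-block:: c
 *
 *	FILE *f = fopen("/sys/class/drm/card0/device/ras/gpu_vram_bad_pages", "r");
 *	unsigned int pfn, size;
 *	char flag;
 *
 *	while (f && fscanf(f, "0x%x : 0x%x : %c", &pfn, &size, &flag) == 3)
 *		printf("pfn 0x%x, %u bytes, state %c\n", pfn, size, flag);
 *	if (f)
 *		fclose(f);
 */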
static ssize_t amdgpu_ras_sysfs_schema_show(struct device *dev, 2014 struct device_attribute *attr, char *buf) 2015 { 2016 struct amdgpu_ras *con = 2017 container_of(attr, struct amdgpu_ras, schema_attr); 2018 return sysfs_emit(buf, "schema: 0x%x\n", con->schema); 2019 } 2020 2021 static struct { 2022 enum ras_event_type type; 2023 const char *name; 2024 } dump_event[] = { 2025 {RAS_EVENT_TYPE_FATAL, "Fatal Error"}, 2026 {RAS_EVENT_TYPE_POISON_CREATION, "Poison Creation"}, 2027 {RAS_EVENT_TYPE_POISON_CONSUMPTION, "Poison Consumption"}, 2028 }; 2029 2030 static ssize_t amdgpu_ras_sysfs_event_state_show(struct device *dev, 2031 struct device_attribute *attr, char *buf) 2032 { 2033 struct amdgpu_ras *con = 2034 container_of(attr, struct amdgpu_ras, event_state_attr); 2035 struct ras_event_manager *event_mgr = con->event_mgr; 2036 struct ras_event_state *event_state; 2037 int i, size = 0; 2038 2039 if (!event_mgr) 2040 return -EINVAL; 2041 2042 size += sysfs_emit_at(buf, size, "current seqno: %llu\n", atomic64_read(&event_mgr->seqno)); 2043 for (i = 0; i < ARRAY_SIZE(dump_event); i++) { 2044 event_state = &event_mgr->event_state[dump_event[i].type]; 2045 size += sysfs_emit_at(buf, size, "%s: count:%llu, last_seqno:%llu\n", 2046 dump_event[i].name, 2047 atomic64_read(&event_state->count), 2048 event_state->last_seqno); 2049 } 2050 2051 return (ssize_t)size; 2052 } 2053 2054 static void amdgpu_ras_sysfs_remove_bad_page_node(struct amdgpu_device *adev) 2055 { 2056 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 2057 2058 if (adev->dev->kobj.sd) 2059 sysfs_remove_file_from_group(&adev->dev->kobj, 2060 &con->badpages_attr.attr, 2061 RAS_FS_NAME); 2062 } 2063 2064 static int amdgpu_ras_sysfs_remove_dev_attr_node(struct amdgpu_device *adev) 2065 { 2066 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 2067 struct attribute *attrs[] = { 2068 &con->features_attr.attr, 2069 &con->version_attr.attr, 2070 &con->schema_attr.attr, 2071 &con->event_state_attr.attr, 2072 NULL 2073 }; 2074 struct attribute_group group = { 2075 .name = RAS_FS_NAME, 2076 .attrs = attrs, 2077 }; 2078 2079 if (adev->dev->kobj.sd) 2080 sysfs_remove_group(&adev->dev->kobj, &group); 2081 2082 return 0; 2083 } 2084 2085 int amdgpu_ras_sysfs_create(struct amdgpu_device *adev, 2086 struct ras_common_if *head) 2087 { 2088 struct ras_manager *obj = amdgpu_ras_find_obj(adev, head); 2089 2090 if (amdgpu_aca_is_enabled(adev)) 2091 return 0; 2092 2093 if (!obj || obj->attr_inuse) 2094 return -EINVAL; 2095 2096 if (amdgpu_sriov_vf(adev) && !amdgpu_virt_ras_telemetry_block_en(adev, head->block)) 2097 return 0; 2098 2099 get_obj(obj); 2100 2101 snprintf(obj->fs_data.sysfs_name, sizeof(obj->fs_data.sysfs_name), 2102 "%s_err_count", head->name); 2103 2104 obj->sysfs_attr = (struct device_attribute){ 2105 .attr = { 2106 .name = obj->fs_data.sysfs_name, 2107 .mode = S_IRUGO, 2108 }, 2109 .show = amdgpu_ras_sysfs_read, 2110 }; 2111 sysfs_attr_init(&obj->sysfs_attr.attr); 2112 2113 if (sysfs_add_file_to_group(&adev->dev->kobj, 2114 &obj->sysfs_attr.attr, 2115 RAS_FS_NAME)) { 2116 put_obj(obj); 2117 return -EINVAL; 2118 } 2119 2120 obj->attr_inuse = 1; 2121 2122 return 0; 2123 } 2124 2125 int amdgpu_ras_sysfs_remove(struct amdgpu_device *adev, 2126 struct ras_common_if *head) 2127 { 2128 struct ras_manager *obj = amdgpu_ras_find_obj(adev, head); 2129 2130 if (amdgpu_aca_is_enabled(adev)) 2131 return 0; 2132 2133 if (!obj || !obj->attr_inuse) 2134 return -EINVAL; 2135 2136 if (adev->dev->kobj.sd) 2137 
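		/* kobj.sd is NULL once the device's sysfs directory has been
		 * torn down (e.g. during unregister), so skip the removal in
		 * that case rather than touching attributes that are already
		 * gone.
		 */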
sysfs_remove_file_from_group(&adev->dev->kobj, 2138 &obj->sysfs_attr.attr, 2139 RAS_FS_NAME); 2140 obj->attr_inuse = 0; 2141 put_obj(obj); 2142 2143 return 0; 2144 } 2145 2146 static int amdgpu_ras_sysfs_remove_all(struct amdgpu_device *adev) 2147 { 2148 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 2149 struct ras_manager *obj, *tmp; 2150 2151 list_for_each_entry_safe(obj, tmp, &con->head, node) { 2152 amdgpu_ras_sysfs_remove(adev, &obj->head); 2153 } 2154 2155 if (amdgpu_bad_page_threshold != 0) 2156 amdgpu_ras_sysfs_remove_bad_page_node(adev); 2157 2158 amdgpu_ras_sysfs_remove_dev_attr_node(adev); 2159 2160 return 0; 2161 } 2162 /* sysfs end */ 2163 2164 /** 2165 * DOC: AMDGPU RAS Reboot Behavior for Unrecoverable Errors 2166 * 2167 * Normally when there is an uncorrectable error, the driver will reset 2168 * the GPU to recover. However, in the event of an unrecoverable error, 2169 * the driver provides an interface to reboot the system automatically 2170 * instead. 2171 * 2172 * The following file in debugfs provides that interface: 2173 * /sys/kernel/debug/dri/[0/1/2...]/ras/auto_reboot 2174 * 2175 * Usage: 2176 * 2177 * .. code-block:: bash 2178 * 2179 * echo true > .../ras/auto_reboot 2180 * 2181 */ 2182 /* debugfs begin */ 2183 static struct dentry *amdgpu_ras_debugfs_create_ctrl_node(struct amdgpu_device *adev) 2184 { 2185 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 2186 struct amdgpu_ras_eeprom_control *eeprom = &con->eeprom_control; 2187 struct drm_minor *minor = adev_to_drm(adev)->primary; 2188 struct dentry *dir; 2189 2190 dir = debugfs_create_dir(RAS_FS_NAME, minor->debugfs_root); 2191 debugfs_create_file("ras_ctrl", S_IWUGO | S_IRUGO, dir, adev, 2192 &amdgpu_ras_debugfs_ctrl_ops); 2193 debugfs_create_file("ras_eeprom_reset", S_IWUGO | S_IRUGO, dir, adev, 2194 &amdgpu_ras_debugfs_eeprom_ops); 2195 debugfs_create_u32("bad_page_cnt_threshold", 0444, dir, 2196 &con->bad_page_cnt_threshold); 2197 debugfs_create_u32("ras_num_recs", 0444, dir, &eeprom->ras_num_recs); 2198 debugfs_create_x32("ras_hw_enabled", 0444, dir, &adev->ras_hw_enabled); 2199 debugfs_create_x32("ras_enabled", 0444, dir, &adev->ras_enabled); 2200 debugfs_create_file("ras_eeprom_size", S_IRUGO, dir, adev, 2201 &amdgpu_ras_debugfs_eeprom_size_ops); 2202 con->de_ras_eeprom_table = debugfs_create_file("ras_eeprom_table", 2203 S_IRUGO, dir, adev, 2204 &amdgpu_ras_debugfs_eeprom_table_ops); 2205 amdgpu_ras_debugfs_set_ret_size(&con->eeprom_control); 2206 2207 /* 2208 * After an uncorrectable error happens, GPU recovery is usually 2209 * scheduled. But GPU recovery is known to sometimes fail to bring 2210 * the GPU back, so the interface below gives the user a direct way 2211 * to reboot the system automatically when an 2212 * ERREVENT_ATHUB_INTERRUPT is generated. In that case the normal 2213 * GPU recovery routine is never called. 2214 */ 2215 debugfs_create_bool("auto_reboot", S_IWUGO | S_IRUGO, dir, &con->reboot); 2216 2217 /* 2218 * The user can set this so that the hardware error count registers of 2219 * RAS IPs are not cleaned up during RAS recovery.
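	 * For example (illustrative, assuming DRM minor 0):
	 *
	 *   echo 1 > /sys/kernel/debug/dri/0/ras/disable_ras_err_cnt_harvest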
2220 */ 2221 debugfs_create_bool("disable_ras_err_cnt_harvest", 0644, dir, 2222 &con->disable_ras_err_cnt_harvest); 2223 return dir; 2224 } 2225 2226 static void amdgpu_ras_debugfs_create(struct amdgpu_device *adev, 2227 struct ras_fs_if *head, 2228 struct dentry *dir) 2229 { 2230 struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head->head); 2231 2232 if (!obj || !dir) 2233 return; 2234 2235 get_obj(obj); 2236 2237 memcpy(obj->fs_data.debugfs_name, 2238 head->debugfs_name, 2239 sizeof(obj->fs_data.debugfs_name)); 2240 2241 debugfs_create_file(obj->fs_data.debugfs_name, S_IWUGO | S_IRUGO, dir, 2242 obj, &amdgpu_ras_debugfs_ops); 2243 } 2244 2245 static bool amdgpu_ras_aca_is_supported(struct amdgpu_device *adev) 2246 { 2247 bool ret; 2248 2249 switch (amdgpu_ip_version(adev, MP0_HWIP, 0)) { 2250 case IP_VERSION(13, 0, 6): 2251 case IP_VERSION(13, 0, 12): 2252 case IP_VERSION(13, 0, 14): 2253 case IP_VERSION(13, 0, 15): 2254 ret = true; 2255 break; 2256 default: 2257 ret = false; 2258 break; 2259 } 2260 2261 return ret; 2262 } 2263 2264 void amdgpu_ras_debugfs_create_all(struct amdgpu_device *adev) 2265 { 2266 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 2267 struct dentry *dir; 2268 struct ras_manager *obj; 2269 struct ras_fs_if fs_info; 2270 2271 /* 2272 * it won't be called in resume path, no need to check 2273 * suspend and gpu reset status 2274 */ 2275 if (!IS_ENABLED(CONFIG_DEBUG_FS) || !con) 2276 return; 2277 2278 dir = amdgpu_ras_debugfs_create_ctrl_node(adev); 2279 2280 list_for_each_entry(obj, &con->head, node) { 2281 if (amdgpu_ras_is_supported(adev, obj->head.block) && 2282 (obj->attr_inuse == 1)) { 2283 sprintf(fs_info.debugfs_name, "%s_err_inject", 2284 get_ras_block_str(&obj->head)); 2285 fs_info.head = obj->head; 2286 amdgpu_ras_debugfs_create(adev, &fs_info, dir); 2287 } 2288 } 2289 2290 if (amdgpu_ras_aca_is_supported(adev)) { 2291 if (amdgpu_aca_is_enabled(adev)) 2292 amdgpu_aca_smu_debugfs_init(adev, dir); 2293 else 2294 amdgpu_mca_smu_debugfs_init(adev, dir); 2295 } 2296 } 2297 2298 /* debugfs end */ 2299 2300 /* ras fs */ 2301 static const BIN_ATTR(gpu_vram_bad_pages, S_IRUGO, 2302 amdgpu_ras_sysfs_badpages_read, NULL, 0); 2303 static DEVICE_ATTR(features, S_IRUGO, 2304 amdgpu_ras_sysfs_features_read, NULL); 2305 static DEVICE_ATTR(version, 0444, 2306 amdgpu_ras_sysfs_version_show, NULL); 2307 static DEVICE_ATTR(schema, 0444, 2308 amdgpu_ras_sysfs_schema_show, NULL); 2309 static DEVICE_ATTR(event_state, 0444, 2310 amdgpu_ras_sysfs_event_state_show, NULL); 2311 static int amdgpu_ras_fs_init(struct amdgpu_device *adev) 2312 { 2313 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 2314 struct attribute_group group = { 2315 .name = RAS_FS_NAME, 2316 }; 2317 struct attribute *attrs[] = { 2318 &con->features_attr.attr, 2319 &con->version_attr.attr, 2320 &con->schema_attr.attr, 2321 &con->event_state_attr.attr, 2322 NULL 2323 }; 2324 const struct bin_attribute *bin_attrs[] = { 2325 NULL, 2326 NULL, 2327 }; 2328 int r; 2329 2330 group.attrs = attrs; 2331 2332 /* add features entry */ 2333 con->features_attr = dev_attr_features; 2334 sysfs_attr_init(attrs[0]); 2335 2336 /* add version entry */ 2337 con->version_attr = dev_attr_version; 2338 sysfs_attr_init(attrs[1]); 2339 2340 /* add schema entry */ 2341 con->schema_attr = dev_attr_schema; 2342 sysfs_attr_init(attrs[2]); 2343 2344 /* add event_state entry */ 2345 con->event_state_attr = dev_attr_event_state; 2346 sysfs_attr_init(attrs[3]); 2347 2348 if (amdgpu_bad_page_threshold != 0) { 2349 /* add 
bad_page_features entry */ 2350 con->badpages_attr = bin_attr_gpu_vram_bad_pages; 2351 sysfs_bin_attr_init(&con->badpages_attr); 2352 bin_attrs[0] = &con->badpages_attr; 2353 group.bin_attrs = bin_attrs; 2354 } 2355 2356 r = sysfs_create_group(&adev->dev->kobj, &group); 2357 if (r) 2358 dev_err(adev->dev, "Failed to create RAS sysfs group!"); 2359 2360 return 0; 2361 } 2362 2363 static int amdgpu_ras_fs_fini(struct amdgpu_device *adev) 2364 { 2365 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 2366 struct ras_manager *con_obj, *ip_obj, *tmp; 2367 2368 if (IS_ENABLED(CONFIG_DEBUG_FS)) { 2369 list_for_each_entry_safe(con_obj, tmp, &con->head, node) { 2370 ip_obj = amdgpu_ras_find_obj(adev, &con_obj->head); 2371 if (ip_obj) 2372 put_obj(ip_obj); 2373 } 2374 } 2375 2376 amdgpu_ras_sysfs_remove_all(adev); 2377 return 0; 2378 } 2379 /* ras fs end */ 2380 2381 /* ih begin */ 2382 2383 /* For the hardware that cannot enable bif ring for both ras_controller_irq 2384 * and ras_err_evnet_athub_irq ih cookies, the driver has to poll status 2385 * register to check whether the interrupt is triggered or not, and properly 2386 * ack the interrupt if it is there 2387 */ 2388 void amdgpu_ras_interrupt_fatal_error_handler(struct amdgpu_device *adev) 2389 { 2390 /* Fatal error events are handled on host side */ 2391 if (amdgpu_sriov_vf(adev)) 2392 return; 2393 /* 2394 * If the current interrupt is caused by a non-fatal RAS error, skip 2395 * check for fatal error. For fatal errors, FED status of all devices 2396 * in XGMI hive gets set when the first device gets fatal error 2397 * interrupt. The error gets propagated to other devices as well, so 2398 * make sure to ack the interrupt regardless of FED status. 2399 */ 2400 if (!amdgpu_ras_get_fed_status(adev) && 2401 amdgpu_ras_is_err_state(adev, AMDGPU_RAS_BLOCK__ANY)) 2402 return; 2403 2404 if (amdgpu_uniras_enabled(adev)) { 2405 amdgpu_ras_mgr_handle_fatal_interrupt(adev, NULL); 2406 return; 2407 } 2408 2409 if (adev->nbio.ras && 2410 adev->nbio.ras->handle_ras_controller_intr_no_bifring) 2411 adev->nbio.ras->handle_ras_controller_intr_no_bifring(adev); 2412 2413 if (adev->nbio.ras && 2414 adev->nbio.ras->handle_ras_err_event_athub_intr_no_bifring) 2415 adev->nbio.ras->handle_ras_err_event_athub_intr_no_bifring(adev); 2416 } 2417 2418 static void amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager *obj, 2419 struct amdgpu_iv_entry *entry) 2420 { 2421 bool poison_stat = false; 2422 struct amdgpu_device *adev = obj->adev; 2423 struct amdgpu_ras_block_object *block_obj = 2424 amdgpu_ras_get_ras_block(adev, obj->head.block, 0); 2425 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 2426 enum ras_event_type type = RAS_EVENT_TYPE_POISON_CONSUMPTION; 2427 u64 event_id; 2428 int ret; 2429 2430 if (!block_obj || !con) 2431 return; 2432 2433 ret = amdgpu_ras_mark_ras_event(adev, type); 2434 if (ret) 2435 return; 2436 2437 amdgpu_ras_set_err_poison(adev, block_obj->ras_comm.block); 2438 /* both query_poison_status and handle_poison_consumption are optional, 2439 * but at least one of them should be implemented if we need poison 2440 * consumption handler 2441 */ 2442 if (block_obj->hw_ops && block_obj->hw_ops->query_poison_status) { 2443 poison_stat = block_obj->hw_ops->query_poison_status(adev); 2444 if (!poison_stat) { 2445 /* Not poison consumption interrupt, no need to handle it */ 2446 dev_info(adev->dev, "No RAS poison status in %s poison IH.\n", 2447 block_obj->ras_comm.name); 2448 2449 return; 2450 } 2451 } 2452 2453 
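	/* Hand the consumed poison to the UMC handler first so that the
	 * affected pages can be retired; in the RMA case that path also
	 * takes care of scheduling the GPU reset (see the comment below).
	 */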
amdgpu_umc_poison_handler(adev, obj->head.block, 0); 2454 2455 if (block_obj->hw_ops && block_obj->hw_ops->handle_poison_consumption) 2456 poison_stat = block_obj->hw_ops->handle_poison_consumption(adev); 2457 2458 /* gpu reset is fallback for failed and default cases. 2459 * For RMA case, amdgpu_umc_poison_handler will handle gpu reset. 2460 */ 2461 if (poison_stat && !amdgpu_ras_is_rma(adev)) { 2462 event_id = amdgpu_ras_acquire_event_id(adev, type); 2463 RAS_EVENT_LOG(adev, event_id, 2464 "GPU reset for %s RAS poison consumption is issued!\n", 2465 block_obj->ras_comm.name); 2466 amdgpu_ras_reset_gpu(adev); 2467 } 2468 2469 if (!poison_stat) 2470 amdgpu_gfx_poison_consumption_handler(adev, entry); 2471 } 2472 2473 static void amdgpu_ras_interrupt_poison_creation_handler(struct ras_manager *obj, 2474 struct amdgpu_iv_entry *entry) 2475 { 2476 struct amdgpu_device *adev = obj->adev; 2477 enum ras_event_type type = RAS_EVENT_TYPE_POISON_CREATION; 2478 u64 event_id; 2479 int ret; 2480 2481 ret = amdgpu_ras_mark_ras_event(adev, type); 2482 if (ret) 2483 return; 2484 2485 event_id = amdgpu_ras_acquire_event_id(adev, type); 2486 RAS_EVENT_LOG(adev, event_id, "Poison is created\n"); 2487 2488 if (amdgpu_ip_version(obj->adev, UMC_HWIP, 0) >= IP_VERSION(12, 0, 0)) { 2489 struct amdgpu_ras *con = amdgpu_ras_get_context(obj->adev); 2490 2491 atomic_inc(&con->page_retirement_req_cnt); 2492 atomic_inc(&con->poison_creation_count); 2493 2494 wake_up(&con->page_retirement_wq); 2495 } 2496 } 2497 2498 static void amdgpu_ras_interrupt_umc_handler(struct ras_manager *obj, 2499 struct amdgpu_iv_entry *entry) 2500 { 2501 struct ras_ih_data *data = &obj->ih_data; 2502 struct ras_err_data err_data; 2503 int ret; 2504 2505 if (!data->cb) 2506 return; 2507 2508 ret = amdgpu_ras_error_data_init(&err_data); 2509 if (ret) 2510 return; 2511 2512 /* Let IP handle its data, maybe we need get the output 2513 * from the callback to update the error type/count, etc 2514 */ 2515 amdgpu_ras_set_fed(obj->adev, true); 2516 ret = data->cb(obj->adev, &err_data, entry); 2517 /* ue will trigger an interrupt, and in that case 2518 * we need do a reset to recovery the whole system. 2519 * But leave IP do that recovery, here we just dispatch 2520 * the error. 
2521 */ 2522 if (ret == AMDGPU_RAS_SUCCESS) { 2523 /* these counts could be left as 0 if 2524 * some blocks do not count error number 2525 */ 2526 obj->err_data.ue_count += err_data.ue_count; 2527 obj->err_data.ce_count += err_data.ce_count; 2528 obj->err_data.de_count += err_data.de_count; 2529 } 2530 2531 amdgpu_ras_error_data_fini(&err_data); 2532 } 2533 2534 static void amdgpu_ras_interrupt_handler(struct ras_manager *obj) 2535 { 2536 struct ras_ih_data *data = &obj->ih_data; 2537 struct amdgpu_iv_entry entry; 2538 2539 while (data->rptr != data->wptr) { 2540 rmb(); 2541 memcpy(&entry, &data->ring[data->rptr], 2542 data->element_size); 2543 2544 wmb(); 2545 data->rptr = (data->aligned_element_size + 2546 data->rptr) % data->ring_size; 2547 2548 if (amdgpu_ras_is_poison_mode_supported(obj->adev)) { 2549 if (obj->head.block == AMDGPU_RAS_BLOCK__UMC) 2550 amdgpu_ras_interrupt_poison_creation_handler(obj, &entry); 2551 else 2552 amdgpu_ras_interrupt_poison_consumption_handler(obj, &entry); 2553 } else { 2554 if (obj->head.block == AMDGPU_RAS_BLOCK__UMC) 2555 amdgpu_ras_interrupt_umc_handler(obj, &entry); 2556 else 2557 dev_warn(obj->adev->dev, 2558 "No RAS interrupt handler for non-UMC block with poison disabled.\n"); 2559 } 2560 } 2561 } 2562 2563 static void amdgpu_ras_interrupt_process_handler(struct work_struct *work) 2564 { 2565 struct ras_ih_data *data = 2566 container_of(work, struct ras_ih_data, ih_work); 2567 struct ras_manager *obj = 2568 container_of(data, struct ras_manager, ih_data); 2569 2570 amdgpu_ras_interrupt_handler(obj); 2571 } 2572 2573 int amdgpu_ras_interrupt_dispatch(struct amdgpu_device *adev, 2574 struct ras_dispatch_if *info) 2575 { 2576 struct ras_manager *obj; 2577 struct ras_ih_data *data; 2578 2579 if (amdgpu_uniras_enabled(adev)) { 2580 struct ras_ih_info ih_info; 2581 2582 memset(&ih_info, 0, sizeof(ih_info)); 2583 ih_info.block = info->head.block; 2584 memcpy(&ih_info.iv_entry, info->entry, sizeof(struct amdgpu_iv_entry)); 2585 2586 return amdgpu_ras_mgr_handle_controller_interrupt(adev, &ih_info); 2587 } 2588 2589 obj = amdgpu_ras_find_obj(adev, &info->head); 2590 if (!obj) 2591 return -EINVAL; 2592 2593 data = &obj->ih_data; 2594 2595 if (data->inuse == 0) 2596 return 0; 2597 2598 /* Might be overflow... 
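	 * The IH ring allocated in amdgpu_ras_interrupt_add_handler() only
	 * holds 64 entries; if interrupts arrive faster than the worker
	 * drains them, wptr can wrap past rptr and unprocessed entries are
	 * silently overwritten, as no overflow check is done here.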
*/ 2599 memcpy(&data->ring[data->wptr], info->entry, 2600 data->element_size); 2601 2602 wmb(); 2603 data->wptr = (data->aligned_element_size + 2604 data->wptr) % data->ring_size; 2605 2606 schedule_work(&data->ih_work); 2607 2608 return 0; 2609 } 2610 2611 int amdgpu_ras_interrupt_remove_handler(struct amdgpu_device *adev, 2612 struct ras_common_if *head) 2613 { 2614 struct ras_manager *obj = amdgpu_ras_find_obj(adev, head); 2615 struct ras_ih_data *data; 2616 2617 if (!obj) 2618 return -EINVAL; 2619 2620 data = &obj->ih_data; 2621 if (data->inuse == 0) 2622 return 0; 2623 2624 cancel_work_sync(&data->ih_work); 2625 2626 kfree(data->ring); 2627 memset(data, 0, sizeof(*data)); 2628 put_obj(obj); 2629 2630 return 0; 2631 } 2632 2633 int amdgpu_ras_interrupt_add_handler(struct amdgpu_device *adev, 2634 struct ras_common_if *head) 2635 { 2636 struct ras_manager *obj = amdgpu_ras_find_obj(adev, head); 2637 struct ras_ih_data *data; 2638 struct amdgpu_ras_block_object *ras_obj; 2639 2640 if (!obj) { 2641 /* in case we registe the IH before enable ras feature */ 2642 obj = amdgpu_ras_create_obj(adev, head); 2643 if (!obj) 2644 return -EINVAL; 2645 } else 2646 get_obj(obj); 2647 2648 ras_obj = container_of(head, struct amdgpu_ras_block_object, ras_comm); 2649 2650 data = &obj->ih_data; 2651 /* add the callback.etc */ 2652 *data = (struct ras_ih_data) { 2653 .inuse = 0, 2654 .cb = ras_obj->ras_cb, 2655 .element_size = sizeof(struct amdgpu_iv_entry), 2656 .rptr = 0, 2657 .wptr = 0, 2658 }; 2659 2660 INIT_WORK(&data->ih_work, amdgpu_ras_interrupt_process_handler); 2661 2662 data->aligned_element_size = ALIGN(data->element_size, 8); 2663 /* the ring can store 64 iv entries. */ 2664 data->ring_size = 64 * data->aligned_element_size; 2665 data->ring = kmalloc(data->ring_size, GFP_KERNEL); 2666 if (!data->ring) { 2667 put_obj(obj); 2668 return -ENOMEM; 2669 } 2670 2671 /* IH is ready */ 2672 data->inuse = 1; 2673 2674 return 0; 2675 } 2676 2677 static int amdgpu_ras_interrupt_remove_all(struct amdgpu_device *adev) 2678 { 2679 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 2680 struct ras_manager *obj, *tmp; 2681 2682 list_for_each_entry_safe(obj, tmp, &con->head, node) { 2683 amdgpu_ras_interrupt_remove_handler(adev, &obj->head); 2684 } 2685 2686 return 0; 2687 } 2688 /* ih end */ 2689 2690 /* traversal all IPs except NBIO to query error counter */ 2691 static void amdgpu_ras_log_on_err_counter(struct amdgpu_device *adev, enum ras_event_type type) 2692 { 2693 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 2694 struct ras_manager *obj; 2695 2696 if (!adev->ras_enabled || !con) 2697 return; 2698 2699 list_for_each_entry(obj, &con->head, node) { 2700 struct ras_query_if info = { 2701 .head = obj->head, 2702 }; 2703 2704 /* 2705 * PCIE_BIF IP has one different isr by ras controller 2706 * interrupt, the specific ras counter query will be 2707 * done in that isr. So skip such block from common 2708 * sync flood interrupt isr calling. 2709 */ 2710 if (info.head.block == AMDGPU_RAS_BLOCK__PCIE_BIF) 2711 continue; 2712 2713 /* 2714 * this is a workaround for aldebaran, skip send msg to 2715 * smu to get ecc_info table due to smu handle get ecc 2716 * info table failed temporarily. 2717 * should be removed until smu fix handle ecc_info table. 
2718 */ 2719 if ((info.head.block == AMDGPU_RAS_BLOCK__UMC) && 2720 (amdgpu_ip_version(adev, MP1_HWIP, 0) == 2721 IP_VERSION(13, 0, 2))) 2722 continue; 2723 2724 amdgpu_ras_query_error_status_with_event(adev, &info, type); 2725 2726 if (amdgpu_ip_version(adev, MP0_HWIP, 0) != 2727 IP_VERSION(11, 0, 2) && 2728 amdgpu_ip_version(adev, MP0_HWIP, 0) != 2729 IP_VERSION(11, 0, 4) && 2730 amdgpu_ip_version(adev, MP0_HWIP, 0) != 2731 IP_VERSION(13, 0, 0)) { 2732 if (amdgpu_ras_reset_error_status(adev, info.head.block)) 2733 dev_warn(adev->dev, "Failed to reset error counter and error status"); 2734 } 2735 } 2736 } 2737 2738 /* Parse RdRspStatus and WrRspStatus */ 2739 static void amdgpu_ras_error_status_query(struct amdgpu_device *adev, 2740 struct ras_query_if *info) 2741 { 2742 struct amdgpu_ras_block_object *block_obj; 2743 /* 2744 * Only two block need to query read/write 2745 * RspStatus at current state 2746 */ 2747 if ((info->head.block != AMDGPU_RAS_BLOCK__GFX) && 2748 (info->head.block != AMDGPU_RAS_BLOCK__MMHUB)) 2749 return; 2750 2751 block_obj = amdgpu_ras_get_ras_block(adev, 2752 info->head.block, 2753 info->head.sub_block_index); 2754 2755 if (!block_obj || !block_obj->hw_ops) { 2756 dev_dbg_once(adev->dev, "%s doesn't config RAS function\n", 2757 get_ras_block_str(&info->head)); 2758 return; 2759 } 2760 2761 if (block_obj->hw_ops->query_ras_error_status) 2762 block_obj->hw_ops->query_ras_error_status(adev); 2763 2764 } 2765 2766 static void amdgpu_ras_query_err_status(struct amdgpu_device *adev) 2767 { 2768 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 2769 struct ras_manager *obj; 2770 2771 if (!adev->ras_enabled || !con) 2772 return; 2773 2774 list_for_each_entry(obj, &con->head, node) { 2775 struct ras_query_if info = { 2776 .head = obj->head, 2777 }; 2778 2779 amdgpu_ras_error_status_query(adev, &info); 2780 } 2781 } 2782 2783 static int amdgpu_ras_badpages_read(struct amdgpu_device *adev, 2784 struct ras_badpage *bps, uint32_t count, uint32_t start) 2785 { 2786 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 2787 struct ras_err_handler_data *data; 2788 int r = 0; 2789 uint32_t i; 2790 2791 if (!con || !con->eh_data || !bps || !count) 2792 return -EINVAL; 2793 2794 mutex_lock(&con->recovery_lock); 2795 data = con->eh_data; 2796 if (start < data->count) { 2797 for (i = start; i < data->count; i++) { 2798 if (!data->bps[i].ts) 2799 continue; 2800 2801 /* U64_MAX is used to mark the record as invalid */ 2802 if (data->bps[i].retired_page == U64_MAX) 2803 continue; 2804 2805 bps[r].bp = data->bps[i].retired_page; 2806 r++; 2807 if (r >= count) 2808 break; 2809 } 2810 } 2811 mutex_unlock(&con->recovery_lock); 2812 2813 return r; 2814 } 2815 2816 static int amdgpu_uniras_badpages_read(struct amdgpu_device *adev, 2817 struct ras_badpage *bps, uint32_t count, uint32_t start) 2818 { 2819 struct ras_cmd_bad_pages_info_req cmd_input; 2820 struct ras_cmd_bad_pages_info_rsp *output; 2821 uint32_t group, start_group, end_group; 2822 uint32_t pos, pos_in_group; 2823 int r = 0, i; 2824 2825 if (!bps || !count) 2826 return -EINVAL; 2827 2828 output = kmalloc_obj(*output); 2829 if (!output) 2830 return -ENOMEM; 2831 2832 memset(&cmd_input, 0, sizeof(cmd_input)); 2833 2834 start_group = start / RAS_CMD_MAX_BAD_PAGES_PER_GROUP; 2835 end_group = (start + count + RAS_CMD_MAX_BAD_PAGES_PER_GROUP - 1) / 2836 RAS_CMD_MAX_BAD_PAGES_PER_GROUP; 2837 2838 pos = start; 2839 for (group = start_group; group < end_group; group++) { 2840 memset(output, 0, sizeof(*output)); 2841 
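		/* Fetch one group of up to RAS_CMD_MAX_BAD_PAGES_PER_GROUP
		 * records per command, then copy entries from the requested
		 * in-group offset until 'count' records have been returned.
		 */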
cmd_input.group_index = group; 2842 if (amdgpu_ras_mgr_handle_ras_cmd(adev, RAS_CMD__GET_BAD_PAGES, 2843 &cmd_input, sizeof(cmd_input), output, sizeof(*output))) 2844 goto out; 2845 2846 if (pos >= output->bp_total_cnt) 2847 goto out; 2848 2849 pos_in_group = pos - group * RAS_CMD_MAX_BAD_PAGES_PER_GROUP; 2850 for (i = pos_in_group; i < output->bp_in_group; i++, pos++) { 2851 if (!output->records[i].ts) 2852 continue; 2853 2854 bps[r].bp = output->records[i].retired_page; 2855 r++; 2856 if (r >= count) 2857 goto out; 2858 } 2859 } 2860 2861 out: 2862 kfree(output); 2863 return r; 2864 } 2865 2866 static void amdgpu_ras_set_fed_all(struct amdgpu_device *adev, 2867 struct amdgpu_hive_info *hive, bool status) 2868 { 2869 struct amdgpu_device *tmp_adev; 2870 2871 if (hive) { 2872 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) 2873 amdgpu_ras_set_fed(tmp_adev, status); 2874 } else { 2875 amdgpu_ras_set_fed(adev, status); 2876 } 2877 } 2878 2879 bool amdgpu_ras_in_recovery(struct amdgpu_device *adev) 2880 { 2881 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 2882 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 2883 int hive_ras_recovery = 0; 2884 2885 if (hive) { 2886 hive_ras_recovery = atomic_read(&hive->ras_recovery); 2887 amdgpu_put_xgmi_hive(hive); 2888 } 2889 2890 if (ras && (atomic_read(&ras->in_recovery) || hive_ras_recovery)) 2891 return true; 2892 2893 return false; 2894 } 2895 2896 static enum ras_event_type amdgpu_ras_get_fatal_error_event(struct amdgpu_device *adev) 2897 { 2898 if (amdgpu_ras_intr_triggered()) 2899 return RAS_EVENT_TYPE_FATAL; 2900 else 2901 return RAS_EVENT_TYPE_POISON_CONSUMPTION; 2902 } 2903 2904 static void amdgpu_ras_do_recovery(struct work_struct *work) 2905 { 2906 struct amdgpu_ras *ras = 2907 container_of(work, struct amdgpu_ras, recovery_work); 2908 struct amdgpu_device *remote_adev = NULL; 2909 struct amdgpu_device *adev = ras->adev; 2910 struct list_head device_list, *device_list_handle = NULL; 2911 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 2912 unsigned int error_query_mode; 2913 enum ras_event_type type; 2914 2915 if (hive) { 2916 atomic_set(&hive->ras_recovery, 1); 2917 2918 /* If any device which is part of the hive received RAS fatal 2919 * error interrupt, set fatal error status on all. This 2920 * condition will need a recovery, and flag will be cleared 2921 * as part of recovery. 
2922 */ 2923 list_for_each_entry(remote_adev, &hive->device_list, 2924 gmc.xgmi.head) 2925 if (amdgpu_ras_get_fed_status(remote_adev)) { 2926 amdgpu_ras_set_fed_all(adev, hive, true); 2927 break; 2928 } 2929 } 2930 if (!ras->disable_ras_err_cnt_harvest) { 2931 2932 /* Build list of devices to query RAS related errors */ 2933 if (hive && adev->gmc.xgmi.num_physical_nodes > 1) { 2934 device_list_handle = &hive->device_list; 2935 } else { 2936 INIT_LIST_HEAD(&device_list); 2937 list_add_tail(&adev->gmc.xgmi.head, &device_list); 2938 device_list_handle = &device_list; 2939 } 2940 2941 if (amdgpu_ras_get_error_query_mode(adev, &error_query_mode)) { 2942 if (error_query_mode == AMDGPU_RAS_FIRMWARE_ERROR_QUERY) { 2943 /* wait 500ms to ensure pmfw polling mca bank info done */ 2944 msleep(500); 2945 } 2946 } 2947 2948 type = amdgpu_ras_get_fatal_error_event(adev); 2949 list_for_each_entry(remote_adev, 2950 device_list_handle, gmc.xgmi.head) { 2951 if (amdgpu_uniras_enabled(remote_adev)) { 2952 amdgpu_ras_mgr_update_ras_ecc(remote_adev); 2953 } else { 2954 amdgpu_ras_query_err_status(remote_adev); 2955 amdgpu_ras_log_on_err_counter(remote_adev, type); 2956 } 2957 } 2958 2959 } 2960 2961 if (amdgpu_device_should_recover_gpu(ras->adev)) { 2962 struct amdgpu_reset_context reset_context; 2963 memset(&reset_context, 0, sizeof(reset_context)); 2964 2965 reset_context.method = AMD_RESET_METHOD_NONE; 2966 reset_context.reset_req_dev = adev; 2967 reset_context.src = AMDGPU_RESET_SRC_RAS; 2968 set_bit(AMDGPU_SKIP_COREDUMP, &reset_context.flags); 2969 2970 /* Perform full reset in fatal error mode */ 2971 if (!amdgpu_ras_is_poison_mode_supported(ras->adev)) 2972 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); 2973 else { 2974 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); 2975 2976 if (ras->gpu_reset_flags & AMDGPU_RAS_GPU_RESET_MODE2_RESET) { 2977 ras->gpu_reset_flags &= ~AMDGPU_RAS_GPU_RESET_MODE2_RESET; 2978 reset_context.method = AMD_RESET_METHOD_MODE2; 2979 } 2980 2981 /* Fatal error occurs in poison mode, mode1 reset is used to 2982 * recover gpu. 
2983 */ 2984 if (ras->gpu_reset_flags & AMDGPU_RAS_GPU_RESET_MODE1_RESET) { 2985 ras->gpu_reset_flags &= ~AMDGPU_RAS_GPU_RESET_MODE1_RESET; 2986 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); 2987 2988 psp_fatal_error_recovery_quirk(&adev->psp); 2989 } 2990 } 2991 2992 amdgpu_device_gpu_recover(ras->adev, NULL, &reset_context); 2993 } 2994 atomic_set(&ras->in_recovery, 0); 2995 if (hive) { 2996 atomic_set(&hive->ras_recovery, 0); 2997 amdgpu_put_xgmi_hive(hive); 2998 } 2999 } 3000 3001 /* alloc/realloc bps array */ 3002 static int amdgpu_ras_realloc_eh_data_space(struct amdgpu_device *adev, 3003 struct ras_err_handler_data *data, int pages) 3004 { 3005 unsigned int old_space = data->count + data->space_left; 3006 unsigned int new_space = old_space + pages; 3007 unsigned int align_space = ALIGN(new_space, 512); 3008 void *bps = kmalloc_objs(*data->bps, align_space); 3009 3010 if (!bps) { 3011 return -ENOMEM; 3012 } 3013 3014 if (data->bps) { 3015 memcpy(bps, data->bps, 3016 data->count * sizeof(*data->bps)); 3017 kfree(data->bps); 3018 } 3019 3020 data->bps = bps; 3021 data->space_left += align_space - old_space; 3022 return 0; 3023 } 3024 3025 static int amdgpu_ras_mca2pa_by_idx(struct amdgpu_device *adev, 3026 struct eeprom_table_record *bps, 3027 struct ras_err_data *err_data) 3028 { 3029 struct ta_ras_query_address_input addr_in; 3030 uint32_t socket = 0; 3031 int ret = 0; 3032 3033 if (adev->smuio.funcs && adev->smuio.funcs->get_socket_id) 3034 socket = adev->smuio.funcs->get_socket_id(adev); 3035 3036 /* reinit err_data */ 3037 err_data->err_addr_cnt = 0; 3038 err_data->err_addr_len = adev->umc.retire_unit; 3039 3040 memset(&addr_in, 0, sizeof(addr_in)); 3041 addr_in.ma.err_addr = bps->address; 3042 addr_in.ma.socket_id = socket; 3043 addr_in.ma.ch_inst = bps->mem_channel; 3044 if (!amdgpu_ras_smu_eeprom_supported(adev)) { 3045 /* tell RAS TA the node instance is not used */ 3046 addr_in.ma.node_inst = TA_RAS_INV_NODE; 3047 } else { 3048 addr_in.ma.umc_inst = bps->mcumc_id; 3049 addr_in.ma.node_inst = bps->cu; 3050 } 3051 3052 if (adev->umc.ras && adev->umc.ras->convert_ras_err_addr) 3053 ret = adev->umc.ras->convert_ras_err_addr(adev, err_data, 3054 &addr_in, NULL, false); 3055 3056 return ret; 3057 } 3058 3059 static int amdgpu_ras_mca2pa(struct amdgpu_device *adev, 3060 struct eeprom_table_record *bps, 3061 struct ras_err_data *err_data) 3062 { 3063 struct ta_ras_query_address_input addr_in; 3064 uint32_t die_id, socket = 0; 3065 3066 if (adev->smuio.funcs && adev->smuio.funcs->get_socket_id) 3067 socket = adev->smuio.funcs->get_socket_id(adev); 3068 3069 /* although die id is gotten from PA in nps1 mode, the id is 3070 * fitable for any nps mode 3071 */ 3072 if (adev->umc.ras && adev->umc.ras->get_die_id_from_pa) 3073 die_id = adev->umc.ras->get_die_id_from_pa(adev, bps->address, 3074 bps->retired_page << AMDGPU_GPU_PAGE_SHIFT); 3075 else 3076 return -EINVAL; 3077 3078 /* reinit err_data */ 3079 err_data->err_addr_cnt = 0; 3080 err_data->err_addr_len = adev->umc.retire_unit; 3081 3082 memset(&addr_in, 0, sizeof(addr_in)); 3083 addr_in.ma.err_addr = bps->address; 3084 addr_in.ma.ch_inst = bps->mem_channel; 3085 addr_in.ma.umc_inst = bps->mcumc_id; 3086 addr_in.ma.node_inst = die_id; 3087 addr_in.ma.socket_id = socket; 3088 3089 if (adev->umc.ras && adev->umc.ras->convert_ras_err_addr) 3090 return adev->umc.ras->convert_ras_err_addr(adev, err_data, 3091 &addr_in, NULL, false); 3092 else 3093 return -EINVAL; 3094 } 3095 3096 static int __amdgpu_ras_restore_bad_pages(struct 
amdgpu_device *adev, 3097 struct eeprom_table_record *bps, int count) 3098 { 3099 int j; 3100 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 3101 struct ras_err_handler_data *data = con->eh_data; 3102 3103 for (j = 0; j < count; j++) { 3104 if (!data->space_left && 3105 amdgpu_ras_realloc_eh_data_space(adev, data, 256)) { 3106 return -ENOMEM; 3107 } 3108 3109 if (amdgpu_ras_check_bad_page_unlock(con, 3110 bps[j].retired_page << AMDGPU_GPU_PAGE_SHIFT)) { 3111 /* set to U64_MAX to mark it as invalid */ 3112 data->bps[data->count].retired_page = U64_MAX; 3113 data->count++; 3114 data->space_left--; 3115 continue; 3116 } 3117 3118 amdgpu_ras_reserve_page(adev, bps[j].retired_page); 3119 3120 memcpy(&data->bps[data->count], &(bps[j]), 3121 sizeof(struct eeprom_table_record)); 3122 data->count++; 3123 data->space_left--; 3124 con->bad_page_num++; 3125 } 3126 3127 return 0; 3128 } 3129 3130 static int __amdgpu_ras_convert_rec_array_from_rom(struct amdgpu_device *adev, 3131 struct eeprom_table_record *bps, struct ras_err_data *err_data, 3132 enum amdgpu_memory_partition nps) 3133 { 3134 int i = 0; 3135 uint64_t chan_idx_v2; 3136 enum amdgpu_memory_partition save_nps; 3137 3138 save_nps = (bps[0].retired_page >> UMC_NPS_SHIFT) & UMC_NPS_MASK; 3139 chan_idx_v2 = bps[0].retired_page & UMC_CHANNEL_IDX_V2; 3140 3141 /*old asics just have pa in eeprom*/ 3142 if (IP_VERSION_MAJ(amdgpu_ip_version(adev, UMC_HWIP, 0)) < 12) { 3143 memcpy(err_data->err_addr, bps, 3144 sizeof(struct eeprom_table_record) * adev->umc.retire_unit); 3145 goto out; 3146 } 3147 3148 for (i = 0; i < adev->umc.retire_unit; i++) 3149 bps[i].retired_page &= ~(UMC_NPS_MASK << UMC_NPS_SHIFT); 3150 3151 if (save_nps || chan_idx_v2) { 3152 if (save_nps == nps) { 3153 if (amdgpu_umc_pages_in_a_row(adev, err_data, 3154 bps[0].retired_page << AMDGPU_GPU_PAGE_SHIFT)) 3155 return -EINVAL; 3156 for (i = 0; i < adev->umc.retire_unit; i++) { 3157 err_data->err_addr[i].address = bps[0].address; 3158 err_data->err_addr[i].mem_channel = bps[0].mem_channel; 3159 err_data->err_addr[i].bank = bps[0].bank; 3160 err_data->err_addr[i].err_type = bps[0].err_type; 3161 err_data->err_addr[i].mcumc_id = bps[0].mcumc_id; 3162 } 3163 } else { 3164 if (amdgpu_ras_mca2pa_by_idx(adev, &bps[0], err_data)) 3165 return -EINVAL; 3166 } 3167 } else { 3168 if (bps[0].address == 0) { 3169 /* for specific old eeprom data, mca address is not stored, 3170 * calc it from pa 3171 */ 3172 if (amdgpu_umc_pa2mca(adev, bps[0].retired_page << AMDGPU_GPU_PAGE_SHIFT, 3173 &(bps[0].address), AMDGPU_NPS1_PARTITION_MODE)) 3174 return -EINVAL; 3175 } 3176 3177 if (amdgpu_ras_mca2pa(adev, &bps[0], err_data)) { 3178 if (nps == AMDGPU_NPS1_PARTITION_MODE) 3179 memcpy(err_data->err_addr, bps, 3180 sizeof(struct eeprom_table_record) * adev->umc.retire_unit); 3181 else 3182 return -EOPNOTSUPP; 3183 } 3184 } 3185 3186 out: 3187 return __amdgpu_ras_restore_bad_pages(adev, err_data->err_addr, adev->umc.retire_unit); 3188 } 3189 3190 static int __amdgpu_ras_convert_rec_from_rom(struct amdgpu_device *adev, 3191 struct eeprom_table_record *bps, struct ras_err_data *err_data, 3192 enum amdgpu_memory_partition nps) 3193 { 3194 int i = 0; 3195 uint64_t chan_idx_v2; 3196 enum amdgpu_memory_partition save_nps; 3197 3198 if (!amdgpu_ras_smu_eeprom_supported(adev)) { 3199 save_nps = (bps->retired_page >> UMC_NPS_SHIFT) & UMC_NPS_MASK; 3200 chan_idx_v2 = bps->retired_page & UMC_CHANNEL_IDX_V2; 3201 bps->retired_page &= ~(UMC_NPS_MASK << UMC_NPS_SHIFT); 3202 } else { 3203 /* if pmfw manages eeprom, 
save_nps is not stored on eeprom, 3204 * we should always convert mca address into physical address, 3205 * make save_nps different from nps 3206 */ 3207 save_nps = nps + 1; 3208 } 3209 3210 if (save_nps == nps) { 3211 if (amdgpu_umc_pages_in_a_row(adev, err_data, 3212 bps->retired_page << AMDGPU_GPU_PAGE_SHIFT)) 3213 return -EINVAL; 3214 for (i = 0; i < adev->umc.retire_unit; i++) { 3215 err_data->err_addr[i].address = bps->address; 3216 err_data->err_addr[i].mem_channel = bps->mem_channel; 3217 err_data->err_addr[i].bank = bps->bank; 3218 err_data->err_addr[i].err_type = bps->err_type; 3219 err_data->err_addr[i].mcumc_id = bps->mcumc_id; 3220 } 3221 } else { 3222 if (save_nps || chan_idx_v2) { 3223 if (amdgpu_ras_mca2pa_by_idx(adev, bps, err_data)) 3224 return -EINVAL; 3225 } else { 3226 /* for specific old eeprom data, mca address is not stored, 3227 * calc it from pa 3228 */ 3229 if (bps->address == 0) 3230 if (amdgpu_umc_pa2mca(adev, 3231 bps->retired_page << AMDGPU_GPU_PAGE_SHIFT, 3232 &(bps->address), 3233 AMDGPU_NPS1_PARTITION_MODE)) 3234 return -EINVAL; 3235 3236 if (amdgpu_ras_mca2pa(adev, bps, err_data)) 3237 return -EOPNOTSUPP; 3238 } 3239 } 3240 3241 return __amdgpu_ras_restore_bad_pages(adev, err_data->err_addr, 3242 adev->umc.retire_unit); 3243 } 3244 3245 /* it deal with vram only. */ 3246 int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev, 3247 struct eeprom_table_record *bps, int pages, bool from_rom) 3248 { 3249 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 3250 struct ras_err_data err_data; 3251 struct amdgpu_ras_eeprom_control *control = 3252 &adev->psp.ras_context.ras->eeprom_control; 3253 enum amdgpu_memory_partition nps = AMDGPU_NPS1_PARTITION_MODE; 3254 int ret = 0; 3255 uint32_t i = 0; 3256 3257 if (!con || !con->eh_data || !bps || pages <= 0) 3258 return 0; 3259 3260 if (from_rom) { 3261 err_data.err_addr = 3262 kzalloc_objs(struct eeprom_table_record, 3263 adev->umc.retire_unit); 3264 if (!err_data.err_addr) { 3265 dev_warn(adev->dev, "Failed to alloc UMC error address record in mca2pa conversion!\n"); 3266 return -ENOMEM; 3267 } 3268 3269 if (adev->gmc.gmc_funcs->query_mem_partition_mode) 3270 nps = adev->gmc.gmc_funcs->query_mem_partition_mode(adev); 3271 } 3272 3273 mutex_lock(&con->recovery_lock); 3274 3275 if (from_rom) { 3276 /* there is no pa recs in V3, so skip pa recs processing */ 3277 if ((control->tbl_hdr.version < RAS_TABLE_VER_V3) && 3278 !amdgpu_ras_smu_eeprom_supported(adev)) { 3279 for (i = 0; i < pages; i++) { 3280 if (control->ras_num_recs - i >= adev->umc.retire_unit) { 3281 if ((bps[i].address == bps[i + 1].address) && 3282 (bps[i].mem_channel == bps[i + 1].mem_channel)) { 3283 /* deal with retire_unit records a time */ 3284 ret = __amdgpu_ras_convert_rec_array_from_rom(adev, 3285 &bps[i], &err_data, nps); 3286 i += (adev->umc.retire_unit - 1); 3287 } else { 3288 break; 3289 } 3290 } else { 3291 break; 3292 } 3293 } 3294 } 3295 for (; i < pages; i++) { 3296 ret = __amdgpu_ras_convert_rec_from_rom(adev, 3297 &bps[i], &err_data, nps); 3298 } 3299 3300 con->eh_data->count_saved = con->eh_data->count; 3301 } else { 3302 ret = __amdgpu_ras_restore_bad_pages(adev, bps, pages); 3303 } 3304 3305 if (from_rom) 3306 kfree(err_data.err_addr); 3307 mutex_unlock(&con->recovery_lock); 3308 3309 return ret; 3310 } 3311 3312 /* 3313 * write error record array to eeprom, the function should be 3314 * protected by recovery_lock 3315 * new_cnt: new added UE count, excluding reserved bad pages, can be NULL 3316 */ 3317 int 
amdgpu_ras_save_bad_pages(struct amdgpu_device *adev, 3318 unsigned long *new_cnt) 3319 { 3320 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 3321 struct ras_err_handler_data *data; 3322 struct amdgpu_ras_eeprom_control *control; 3323 int save_count, unit_num, i; 3324 3325 if (!con || !con->eh_data) { 3326 if (new_cnt) 3327 *new_cnt = 0; 3328 3329 return 0; 3330 } 3331 3332 if (!con->eeprom_control.is_eeprom_valid) { 3333 dev_warn(adev->dev, 3334 "Failed to save EEPROM table data because of EEPROM data corruption!"); 3335 if (new_cnt) 3336 *new_cnt = 0; 3337 3338 return 0; 3339 } 3340 3341 mutex_lock(&con->recovery_lock); 3342 control = &con->eeprom_control; 3343 data = con->eh_data; 3344 if (amdgpu_ras_smu_eeprom_supported(adev)) 3345 unit_num = control->ras_num_recs - 3346 control->ras_num_recs_old; 3347 else 3348 unit_num = data->count / adev->umc.retire_unit - 3349 control->ras_num_recs; 3350 3351 save_count = con->bad_page_num - control->ras_num_bad_pages; 3352 mutex_unlock(&con->recovery_lock); 3353 3354 if (new_cnt) 3355 *new_cnt = unit_num; 3356 3357 /* only new entries are saved */ 3358 if (unit_num && save_count) { 3359 /*old asics only save pa to eeprom like before*/ 3360 if (IP_VERSION_MAJ(amdgpu_ip_version(adev, UMC_HWIP, 0)) < 12) { 3361 if (amdgpu_ras_eeprom_append(control, 3362 &data->bps[data->count_saved], unit_num)) { 3363 dev_err(adev->dev, "Failed to save EEPROM table data!"); 3364 return -EIO; 3365 } 3366 } else { 3367 for (i = 0; i < unit_num; i++) { 3368 if (amdgpu_ras_eeprom_append(control, 3369 &data->bps[data->count_saved + 3370 i * adev->umc.retire_unit], 1)) { 3371 dev_err(adev->dev, "Failed to save EEPROM table data!"); 3372 return -EIO; 3373 } 3374 } 3375 } 3376 3377 dev_info(adev->dev, "Saved %d pages to EEPROM table.\n", save_count); 3378 data->count_saved = data->count; 3379 } 3380 3381 return 0; 3382 } 3383 3384 /* 3385 * read error record array in eeprom and reserve enough space for 3386 * storing new bad pages 3387 */ 3388 static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev) 3389 { 3390 struct amdgpu_ras_eeprom_control *control = 3391 &adev->psp.ras_context.ras->eeprom_control; 3392 struct eeprom_table_record *bps; 3393 int ret, i = 0; 3394 3395 /* no bad page record, skip eeprom access */ 3396 if (control->ras_num_recs == 0 || amdgpu_bad_page_threshold == 0) 3397 return 0; 3398 3399 bps = kzalloc_objs(*bps, control->ras_num_recs); 3400 if (!bps) 3401 return -ENOMEM; 3402 3403 ret = amdgpu_ras_eeprom_read(control, bps, control->ras_num_recs); 3404 if (ret) { 3405 dev_err(adev->dev, "Failed to load EEPROM table records!"); 3406 } else { 3407 if (adev->umc.ras && adev->umc.ras->convert_ras_err_addr) { 3408 /*In V3, there is no pa recs, and some cases(when address==0) may be parsed 3409 as pa recs, so add verion check to avoid it. 
3410 */ 3411 if ((control->tbl_hdr.version < RAS_TABLE_VER_V3) && 3412 !amdgpu_ras_smu_eeprom_supported(adev)) { 3413 for (i = 0; i < control->ras_num_recs; i++) { 3414 if ((control->ras_num_recs - i) >= adev->umc.retire_unit) { 3415 if ((bps[i].address == bps[i + 1].address) && 3416 (bps[i].mem_channel == bps[i + 1].mem_channel)) { 3417 control->ras_num_pa_recs += adev->umc.retire_unit; 3418 i += (adev->umc.retire_unit - 1); 3419 } else { 3420 control->ras_num_mca_recs += 3421 (control->ras_num_recs - i); 3422 break; 3423 } 3424 } else { 3425 control->ras_num_mca_recs += (control->ras_num_recs - i); 3426 break; 3427 } 3428 } 3429 } else { 3430 control->ras_num_mca_recs = control->ras_num_recs; 3431 } 3432 } 3433 3434 ret = amdgpu_ras_add_bad_pages(adev, bps, control->ras_num_recs, true); 3435 if (ret) 3436 goto out; 3437 3438 ret = amdgpu_ras_eeprom_check(control); 3439 if (ret) 3440 goto out; 3441 3442 /* HW not usable */ 3443 if (amdgpu_ras_is_rma(adev)) 3444 ret = -EHWPOISON; 3445 } 3446 3447 out: 3448 kfree(bps); 3449 return ret; 3450 } 3451 3452 static int amdgpu_ras_check_bad_page_unlock(struct amdgpu_ras *con, 3453 uint64_t addr) 3454 { 3455 struct ras_err_handler_data *data = con->eh_data; 3456 struct amdgpu_device *adev = con->adev; 3457 int i; 3458 3459 if ((addr >= adev->gmc.mc_vram_size && 3460 adev->gmc.mc_vram_size) || 3461 (addr >= RAS_UMC_INJECT_ADDR_LIMIT)) 3462 return -EINVAL; 3463 3464 addr >>= AMDGPU_GPU_PAGE_SHIFT; 3465 for (i = 0; i < data->count; i++) 3466 if (addr == data->bps[i].retired_page) 3467 return 1; 3468 3469 return 0; 3470 } 3471 3472 /* 3473 * check if an address belongs to bad page 3474 * 3475 * Note: this check is only for umc block 3476 */ 3477 static int amdgpu_ras_check_bad_page(struct amdgpu_device *adev, 3478 uint64_t addr) 3479 { 3480 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 3481 int ret = 0; 3482 3483 if (!con || !con->eh_data) 3484 return ret; 3485 3486 mutex_lock(&con->recovery_lock); 3487 ret = amdgpu_ras_check_bad_page_unlock(con, addr); 3488 mutex_unlock(&con->recovery_lock); 3489 return ret; 3490 } 3491 3492 static void amdgpu_ras_validate_threshold(struct amdgpu_device *adev, 3493 uint32_t max_count) 3494 { 3495 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 3496 3497 /* 3498 * amdgpu_bad_page_threshold is used to config 3499 * the threshold for the number of bad pages. 3500 * -1: Threshold is set to default value 3501 * Driver will issue a warning message when threshold is reached 3502 * and continue runtime services. 3503 * 0: Disable bad page retirement 3504 * Driver will not retire bad pages 3505 * which is intended for debugging purpose. 3506 * -2: Threshold is determined by a formula 3507 * that assumes 1 bad page per 100M of local memory. 3508 * Driver will continue runtime services when threhold is reached. 3509 * 0 < threshold < max number of bad page records in EEPROM, 3510 * A user-defined threshold is set 3511 * Driver will halt runtime services when this custom threshold is reached. 
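	 *
	 * Worked example (illustrative): with amdgpu_bad_page_threshold set
	 * to -2 and 64 GiB of VRAM, the threshold is 64 GiB divided by
	 * RAS_BAD_PAGE_COVER (100 MiB), i.e. about 655 bad pages.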
3512 */ 3513 if (amdgpu_bad_page_threshold == -2) { 3514 u64 val = adev->gmc.mc_vram_size; 3515 3516 do_div(val, RAS_BAD_PAGE_COVER); 3517 con->bad_page_cnt_threshold = min(lower_32_bits(val), 3518 max_count); 3519 } else if (amdgpu_bad_page_threshold == -1) { 3520 con->bad_page_cnt_threshold = ((con->reserved_pages_in_bytes) >> 21) << 4; 3521 } else { 3522 con->bad_page_cnt_threshold = min_t(int, max_count, 3523 amdgpu_bad_page_threshold); 3524 } 3525 } 3526 3527 int amdgpu_ras_put_poison_req(struct amdgpu_device *adev, 3528 enum amdgpu_ras_block block, uint16_t pasid, 3529 pasid_notify pasid_fn, void *data, uint32_t reset) 3530 { 3531 int ret = 0; 3532 struct ras_poison_msg poison_msg; 3533 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 3534 3535 memset(&poison_msg, 0, sizeof(poison_msg)); 3536 poison_msg.block = block; 3537 poison_msg.pasid = pasid; 3538 poison_msg.reset = reset; 3539 poison_msg.pasid_fn = pasid_fn; 3540 poison_msg.data = data; 3541 3542 ret = kfifo_put(&con->poison_fifo, poison_msg); 3543 if (!ret) { 3544 dev_err(adev->dev, "Poison message fifo is full!\n"); 3545 return -ENOSPC; 3546 } 3547 3548 return 0; 3549 } 3550 3551 static int amdgpu_ras_get_poison_req(struct amdgpu_device *adev, 3552 struct ras_poison_msg *poison_msg) 3553 { 3554 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 3555 3556 return kfifo_get(&con->poison_fifo, poison_msg); 3557 } 3558 3559 static void amdgpu_ras_ecc_log_init(struct ras_ecc_log_info *ecc_log) 3560 { 3561 mutex_init(&ecc_log->lock); 3562 3563 INIT_RADIX_TREE(&ecc_log->de_page_tree, GFP_KERNEL); 3564 ecc_log->de_queried_count = 0; 3565 ecc_log->consumption_q_count = 0; 3566 } 3567 3568 static void amdgpu_ras_ecc_log_fini(struct ras_ecc_log_info *ecc_log) 3569 { 3570 struct radix_tree_iter iter; 3571 void __rcu **slot; 3572 struct ras_ecc_err *ecc_err; 3573 3574 mutex_lock(&ecc_log->lock); 3575 radix_tree_for_each_slot(slot, &ecc_log->de_page_tree, &iter, 0) { 3576 ecc_err = radix_tree_deref_slot(slot); 3577 kfree(ecc_err->err_pages.pfn); 3578 kfree(ecc_err); 3579 radix_tree_iter_delete(&ecc_log->de_page_tree, &iter, slot); 3580 } 3581 mutex_unlock(&ecc_log->lock); 3582 3583 mutex_destroy(&ecc_log->lock); 3584 ecc_log->de_queried_count = 0; 3585 ecc_log->consumption_q_count = 0; 3586 } 3587 3588 static bool amdgpu_ras_schedule_retirement_dwork(struct amdgpu_ras *con, 3589 uint32_t delayed_ms) 3590 { 3591 int ret; 3592 3593 mutex_lock(&con->umc_ecc_log.lock); 3594 ret = radix_tree_tagged(&con->umc_ecc_log.de_page_tree, 3595 UMC_ECC_NEW_DETECTED_TAG); 3596 mutex_unlock(&con->umc_ecc_log.lock); 3597 3598 if (ret) 3599 schedule_delayed_work(&con->page_retirement_dwork, 3600 msecs_to_jiffies(delayed_ms)); 3601 3602 return ret ? 
true : false; 3603 } 3604 3605 static void amdgpu_ras_do_page_retirement(struct work_struct *work) 3606 { 3607 struct amdgpu_ras *con = container_of(work, struct amdgpu_ras, 3608 page_retirement_dwork.work); 3609 struct amdgpu_device *adev = con->adev; 3610 struct ras_err_data err_data; 3611 3612 /* If gpu reset is ongoing, delay retiring the bad pages */ 3613 if (amdgpu_in_reset(adev) || amdgpu_ras_in_recovery(adev)) { 3614 amdgpu_ras_schedule_retirement_dwork(con, 3615 AMDGPU_RAS_RETIRE_PAGE_INTERVAL * 3); 3616 return; 3617 } 3618 3619 amdgpu_ras_error_data_init(&err_data); 3620 3621 amdgpu_umc_handle_bad_pages(adev, &err_data); 3622 3623 amdgpu_ras_error_data_fini(&err_data); 3624 3625 amdgpu_ras_schedule_retirement_dwork(con, 3626 AMDGPU_RAS_RETIRE_PAGE_INTERVAL); 3627 } 3628 3629 static int amdgpu_ras_poison_creation_handler(struct amdgpu_device *adev, 3630 uint32_t poison_creation_count) 3631 { 3632 int ret = 0; 3633 struct ras_ecc_log_info *ecc_log; 3634 struct ras_query_if info; 3635 u32 timeout = MAX_UMC_POISON_POLLING_TIME_ASYNC; 3636 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 3637 u64 de_queried_count; 3638 u64 consumption_q_count; 3639 enum ras_event_type type = RAS_EVENT_TYPE_POISON_CREATION; 3640 3641 memset(&info, 0, sizeof(info)); 3642 info.head.block = AMDGPU_RAS_BLOCK__UMC; 3643 3644 ecc_log = &ras->umc_ecc_log; 3645 ecc_log->de_queried_count = 0; 3646 ecc_log->consumption_q_count = 0; 3647 3648 do { 3649 ret = amdgpu_ras_query_error_status_with_event(adev, &info, type); 3650 if (ret) 3651 return ret; 3652 3653 de_queried_count = ecc_log->de_queried_count; 3654 consumption_q_count = ecc_log->consumption_q_count; 3655 3656 if (de_queried_count && consumption_q_count) 3657 break; 3658 3659 msleep(100); 3660 } while (--timeout); 3661 3662 if (de_queried_count) 3663 schedule_delayed_work(&ras->page_retirement_dwork, 0); 3664 3665 if (amdgpu_ras_is_rma(adev) && atomic_cmpxchg(&ras->rma_in_recovery, 0, 1) == 0) 3666 amdgpu_ras_reset_gpu(adev); 3667 3668 return 0; 3669 } 3670 3671 static void amdgpu_ras_clear_poison_fifo(struct amdgpu_device *adev) 3672 { 3673 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 3674 struct ras_poison_msg msg; 3675 int ret; 3676 3677 do { 3678 ret = kfifo_get(&con->poison_fifo, &msg); 3679 } while (ret); 3680 } 3681 3682 static int amdgpu_ras_poison_consumption_handler(struct amdgpu_device *adev, 3683 uint32_t msg_count, uint32_t *gpu_reset) 3684 { 3685 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 3686 uint32_t reset_flags = 0, reset = 0; 3687 struct ras_poison_msg msg; 3688 int ret, i; 3689 3690 kgd2kfd_set_sram_ecc_flag(adev->kfd.dev); 3691 3692 for (i = 0; i < msg_count; i++) { 3693 ret = amdgpu_ras_get_poison_req(adev, &msg); 3694 if (!ret) 3695 continue; 3696 3697 if (msg.pasid_fn) 3698 msg.pasid_fn(adev, msg.pasid, msg.data); 3699 3700 reset_flags |= msg.reset; 3701 } 3702 3703 /* 3704 * Try to ensure poison creation handler is completed first 3705 * to set rma if bad page exceed threshold. 
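	 * Flushing the page retirement work below makes sure any pending bad
	 * pages have been accounted and the RMA state is up to date before
	 * deciding whether this consumption path should issue the GPU reset.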
3706 */ 3707 flush_delayed_work(&con->page_retirement_dwork); 3708 3709 /* for RMA, amdgpu_ras_poison_creation_handler will trigger gpu reset */ 3710 if (reset_flags && !amdgpu_ras_is_rma(adev)) { 3711 if (reset_flags & AMDGPU_RAS_GPU_RESET_MODE1_RESET) 3712 reset = AMDGPU_RAS_GPU_RESET_MODE1_RESET; 3713 else if (reset_flags & AMDGPU_RAS_GPU_RESET_MODE2_RESET) 3714 reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET; 3715 else 3716 reset = reset_flags; 3717 3718 con->gpu_reset_flags |= reset; 3719 amdgpu_ras_reset_gpu(adev); 3720 3721 *gpu_reset = reset; 3722 3723 /* Wait for gpu recovery to complete */ 3724 flush_work(&con->recovery_work); 3725 } 3726 3727 return 0; 3728 } 3729 3730 static int amdgpu_ras_page_retirement_thread(void *param) 3731 { 3732 struct amdgpu_device *adev = (struct amdgpu_device *)param; 3733 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 3734 uint32_t poison_creation_count, msg_count; 3735 uint32_t gpu_reset; 3736 int ret; 3737 3738 while (!kthread_should_stop()) { 3739 3740 wait_event_interruptible(con->page_retirement_wq, 3741 kthread_should_stop() || 3742 atomic_read(&con->page_retirement_req_cnt)); 3743 3744 if (kthread_should_stop()) 3745 break; 3746 3747 mutex_lock(&con->poison_lock); 3748 gpu_reset = 0; 3749 3750 do { 3751 poison_creation_count = atomic_read(&con->poison_creation_count); 3752 ret = amdgpu_ras_poison_creation_handler(adev, poison_creation_count); 3753 if (ret == -EIO) 3754 break; 3755 3756 if (poison_creation_count) { 3757 atomic_sub(poison_creation_count, &con->poison_creation_count); 3758 atomic_sub(poison_creation_count, &con->page_retirement_req_cnt); 3759 } 3760 } while (atomic_read(&con->poison_creation_count) && 3761 !atomic_read(&con->poison_consumption_count)); 3762 3763 if (ret != -EIO) { 3764 msg_count = kfifo_len(&con->poison_fifo); 3765 if (msg_count) { 3766 ret = amdgpu_ras_poison_consumption_handler(adev, 3767 msg_count, &gpu_reset); 3768 if ((ret != -EIO) && 3769 (gpu_reset != AMDGPU_RAS_GPU_RESET_MODE1_RESET)) 3770 atomic_sub(msg_count, &con->page_retirement_req_cnt); 3771 } 3772 } 3773 3774 if ((ret == -EIO) || (gpu_reset == AMDGPU_RAS_GPU_RESET_MODE1_RESET)) { 3775 /* gpu mode-1 reset is ongoing or just completed ras mode-1 reset */ 3776 /* Clear poison creation request */ 3777 atomic_set(&con->poison_creation_count, 0); 3778 atomic_set(&con->poison_consumption_count, 0); 3779 3780 /* Clear poison fifo */ 3781 amdgpu_ras_clear_poison_fifo(adev); 3782 3783 /* Clear all poison requests */ 3784 atomic_set(&con->page_retirement_req_cnt, 0); 3785 3786 if (ret == -EIO) { 3787 /* Wait for mode-1 reset to complete */ 3788 down_read(&adev->reset_domain->sem); 3789 up_read(&adev->reset_domain->sem); 3790 } 3791 3792 /* Wake up work to save bad pages to eeprom */ 3793 schedule_delayed_work(&con->page_retirement_dwork, 0); 3794 } else if (gpu_reset) { 3795 /* gpu just completed mode-2 reset or other reset */ 3796 /* Clear poison consumption messages cached in fifo */ 3797 msg_count = kfifo_len(&con->poison_fifo); 3798 if (msg_count) { 3799 amdgpu_ras_clear_poison_fifo(adev); 3800 atomic_sub(msg_count, &con->page_retirement_req_cnt); 3801 } 3802 3803 atomic_set(&con->poison_consumption_count, 0); 3804 3805 /* Wake up work to save bad pages to eeprom */ 3806 schedule_delayed_work(&con->page_retirement_dwork, 0); 3807 } 3808 mutex_unlock(&con->poison_lock); 3809 } 3810 3811 return 0; 3812 } 3813 3814 int amdgpu_ras_init_badpage_info(struct amdgpu_device *adev) 3815 { 3816 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 3817 struct 
amdgpu_ras_eeprom_control *control; 3818 int ret; 3819 3820 if (!con || amdgpu_sriov_vf(adev)) 3821 return 0; 3822 3823 if (amdgpu_uniras_enabled(adev)) 3824 return 0; 3825 3826 control = &con->eeprom_control; 3827 con->ras_smu_drv = amdgpu_dpm_get_ras_smu_driver(adev); 3828 3829 ret = amdgpu_ras_eeprom_init(control); 3830 control->is_eeprom_valid = !ret; 3831 3832 if (!adev->umc.ras || !adev->umc.ras->convert_ras_err_addr) 3833 control->ras_num_pa_recs = control->ras_num_recs; 3834 3835 if (adev->umc.ras && 3836 adev->umc.ras->get_retire_flip_bits) 3837 adev->umc.ras->get_retire_flip_bits(adev); 3838 3839 if (control->ras_num_recs && control->is_eeprom_valid) { 3840 ret = amdgpu_ras_load_bad_pages(adev); 3841 if (ret) { 3842 control->is_eeprom_valid = false; 3843 return 0; 3844 } 3845 3846 amdgpu_dpm_send_hbm_bad_pages_num( 3847 adev, control->ras_num_bad_pages); 3848 3849 if (con->update_channel_flag == true) { 3850 amdgpu_dpm_send_hbm_bad_channel_flag( 3851 adev, control->bad_channel_bitmap); 3852 con->update_channel_flag = false; 3853 } 3854 3855 /* The format action is only applied to new ASICs */ 3856 if (IP_VERSION_MAJ(amdgpu_ip_version(adev, UMC_HWIP, 0)) >= 12 && 3857 control->tbl_hdr.version < RAS_TABLE_VER_V3) 3858 if (!amdgpu_ras_eeprom_reset_table(control)) 3859 if (amdgpu_ras_save_bad_pages(adev, NULL)) 3860 dev_warn(adev->dev, "Failed to format RAS EEPROM data in V3 version!\n"); 3861 } 3862 3863 return 0; 3864 } 3865 3866 int amdgpu_ras_recovery_init(struct amdgpu_device *adev, bool init_bp_info) 3867 { 3868 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 3869 struct ras_err_handler_data **data; 3870 u32 max_eeprom_records_count = 0; 3871 int ret; 3872 3873 if (!con || amdgpu_sriov_vf(adev)) 3874 return 0; 3875 3876 /* Allow access to RAS EEPROM via debugfs, when the ASIC 3877 * supports RAS and debugfs is enabled, but when 3878 * adev->ras_enabled is unset, i.e. when "ras_enable" 3879 * module parameter is set to 0. 
3880 */ 3881 con->adev = adev; 3882 3883 if (!adev->ras_enabled) 3884 return 0; 3885 3886 data = &con->eh_data; 3887 *data = kzalloc_obj(**data); 3888 if (!*data) { 3889 ret = -ENOMEM; 3890 goto out; 3891 } 3892 3893 mutex_init(&con->recovery_lock); 3894 mutex_init(&con->poison_lock); 3895 INIT_WORK(&con->recovery_work, amdgpu_ras_do_recovery); 3896 atomic_set(&con->in_recovery, 0); 3897 atomic_set(&con->rma_in_recovery, 0); 3898 con->eeprom_control.bad_channel_bitmap = 0; 3899 3900 max_eeprom_records_count = amdgpu_ras_eeprom_max_record_count(&con->eeprom_control); 3901 amdgpu_ras_validate_threshold(adev, max_eeprom_records_count); 3902 3903 if (init_bp_info) { 3904 ret = amdgpu_ras_init_badpage_info(adev); 3905 if (ret) 3906 goto free; 3907 } 3908 3909 mutex_init(&con->page_rsv_lock); 3910 INIT_KFIFO(con->poison_fifo); 3911 mutex_init(&con->page_retirement_lock); 3912 init_waitqueue_head(&con->page_retirement_wq); 3913 atomic_set(&con->page_retirement_req_cnt, 0); 3914 atomic_set(&con->poison_creation_count, 0); 3915 atomic_set(&con->poison_consumption_count, 0); 3916 con->page_retirement_thread = 3917 kthread_run(amdgpu_ras_page_retirement_thread, adev, "umc_page_retirement"); 3918 if (IS_ERR(con->page_retirement_thread)) { 3919 con->page_retirement_thread = NULL; 3920 dev_warn(adev->dev, "Failed to create umc_page_retirement thread!!!\n"); 3921 } 3922 3923 INIT_DELAYED_WORK(&con->page_retirement_dwork, amdgpu_ras_do_page_retirement); 3924 amdgpu_ras_ecc_log_init(&con->umc_ecc_log); 3925 #ifdef CONFIG_X86_MCE_AMD 3926 if ((adev->asic_type == CHIP_ALDEBARAN) && 3927 (adev->gmc.xgmi.connected_to_cpu)) 3928 amdgpu_register_bad_pages_mca_notifier(adev); 3929 #endif 3930 return 0; 3931 3932 free: 3933 kfree((*data)->bps); 3934 kfree(*data); 3935 con->eh_data = NULL; 3936 out: 3937 dev_warn(adev->dev, "Failed to initialize ras recovery! (%d)\n", ret); 3938 3939 /* 3940 * Except error threshold exceeding case, other failure cases in this 3941 * function would not fail amdgpu driver init. 
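 * i.e. if the bad page count already exceeds the threshold, the device is
 * treated as RMA and -EINVAL is returned below; any other failure here is
 * only logged and ret is forced back to 0 so driver init can continue.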
3942 */ 3943 if (!amdgpu_ras_is_rma(adev)) 3944 ret = 0; 3945 else 3946 ret = -EINVAL; 3947
3948 return ret; 3949 } 3950
3951 static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev) 3952 { 3953 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 3954 struct ras_err_handler_data *data = con->eh_data; 3955 int max_flush_timeout = MAX_FLUSH_RETIRE_DWORK_TIMES; 3956 bool ret; 3957
3958 /* recovery_init failed to init it, fini is useless */ 3959 if (!data) 3960 return 0; 3961
3962 /* Save all cached bad pages to eeprom */ 3963 do { 3964 flush_delayed_work(&con->page_retirement_dwork); 3965 ret = amdgpu_ras_schedule_retirement_dwork(con, 0); 3966 } while (ret && max_flush_timeout--); 3967
3968 if (con->page_retirement_thread) 3969 kthread_stop(con->page_retirement_thread); 3970
3971 atomic_set(&con->page_retirement_req_cnt, 0); 3972 atomic_set(&con->poison_creation_count, 0); 3973
3974 mutex_destroy(&con->page_rsv_lock); 3975
3976 cancel_work_sync(&con->recovery_work); 3977
3978 cancel_delayed_work_sync(&con->page_retirement_dwork); 3979
3980 amdgpu_ras_ecc_log_fini(&con->umc_ecc_log); 3981
3982 mutex_lock(&con->recovery_lock); 3983 con->eh_data = NULL; 3984 kfree(data->bps); 3985 kfree(data); 3986 mutex_unlock(&con->recovery_lock); 3987
3988 amdgpu_ras_critical_region_init(adev); 3989 #ifdef CONFIG_X86_MCE_AMD 3990 amdgpu_unregister_bad_pages_mca_notifier(adev); 3991 #endif 3992 return 0; 3993 } 3994 /* recovery end */ 3995
3996 static bool amdgpu_ras_asic_supported(struct amdgpu_device *adev) 3997 { 3998 if (amdgpu_sriov_vf(adev)) { 3999 switch (amdgpu_ip_version(adev, MP0_HWIP, 0)) { 4000 case IP_VERSION(13, 0, 2): 4001 case IP_VERSION(13, 0, 6): 4002 case IP_VERSION(13, 0, 12): 4003 case IP_VERSION(13, 0, 14): 4004 case IP_VERSION(13, 0, 15): 4005 return true; 4006 default: 4007 return false; 4008 } 4009 } 4010
4011 if (adev->asic_type == CHIP_IP_DISCOVERY) { 4012 switch (amdgpu_ip_version(adev, MP0_HWIP, 0)) { 4013 case IP_VERSION(13, 0, 0): 4014 case IP_VERSION(13, 0, 6): 4015 case IP_VERSION(13, 0, 10): 4016 case IP_VERSION(13, 0, 12): 4017 case IP_VERSION(13, 0, 14): 4018 case IP_VERSION(13, 0, 15): 4019 case IP_VERSION(14, 0, 3): 4020 return true; 4021 default: 4022 return false; 4023 } 4024 } 4025
4026 return adev->asic_type == CHIP_VEGA10 || 4027 adev->asic_type == CHIP_VEGA20 || 4028 adev->asic_type == CHIP_ARCTURUS || 4029 adev->asic_type == CHIP_ALDEBARAN || 4030 adev->asic_type == CHIP_SIENNA_CICHLID; 4031 } 4032
4033 /* 4034 * this is a workaround for the vega20 workstation sku: 4035 * force enable gfx ras and ignore the vbios gfx ras flag, 4036 * because GC EDC can not be written 4037 */ 4038 static void amdgpu_ras_get_quirks(struct amdgpu_device *adev) 4039 { 4040 struct atom_context *ctx = adev->mode_info.atom_context; 4041
4042 if (!ctx) 4043 return; 4044
4045 if (strnstr(ctx->vbios_pn, "D16406", 4046 sizeof(ctx->vbios_pn)) || 4047 strnstr(ctx->vbios_pn, "D36002", 4048 sizeof(ctx->vbios_pn))) 4049 adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__GFX); 4050 } 4051
4052 /* Query ras capability via atomfirmware interface */ 4053 static void amdgpu_ras_query_ras_capablity_from_vbios(struct amdgpu_device *adev) 4054 { 4055 /* mem_ecc cap */ 4056 if (amdgpu_atomfirmware_mem_ecc_supported(adev)) { 4057 dev_info(adev->dev, "MEM ECC is active.\n"); 4058 adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__UMC | 4059 1 << AMDGPU_RAS_BLOCK__DF); 4060 } else { 4061 dev_info(adev->dev, "MEM ECC is not present.\n"); 4062 } 4063
4064 /* sram_ecc cap */ 4065 if
(amdgpu_atomfirmware_sram_ecc_supported(adev)) { 4066 dev_info(adev->dev, "SRAM ECC is active.\n"); 4067 if (!amdgpu_sriov_vf(adev)) 4068 adev->ras_hw_enabled |= ~(1 << AMDGPU_RAS_BLOCK__UMC | 4069 1 << AMDGPU_RAS_BLOCK__DF); 4070 else 4071 adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__PCIE_BIF | 4072 1 << AMDGPU_RAS_BLOCK__SDMA | 4073 1 << AMDGPU_RAS_BLOCK__GFX); 4074
4075 /* 4076 * VCN/JPEG RAS can be supported on both bare metal and 4077 * SRIOV environment 4078 */ 4079 if (amdgpu_ip_version(adev, VCN_HWIP, 0) == IP_VERSION(2, 6, 0) || 4080 amdgpu_ip_version(adev, VCN_HWIP, 0) == IP_VERSION(4, 0, 0) || 4081 amdgpu_ip_version(adev, VCN_HWIP, 0) == IP_VERSION(4, 0, 3) || 4082 amdgpu_ip_version(adev, VCN_HWIP, 0) == IP_VERSION(5, 0, 1)) 4083 adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__VCN | 4084 1 << AMDGPU_RAS_BLOCK__JPEG); 4085 else 4086 adev->ras_hw_enabled &= ~(1 << AMDGPU_RAS_BLOCK__VCN | 4087 1 << AMDGPU_RAS_BLOCK__JPEG); 4088
4089 /* 4090 * XGMI RAS is not supported if xgmi num physical nodes 4091 * is zero 4092 */ 4093 if (!adev->gmc.xgmi.num_physical_nodes) 4094 adev->ras_hw_enabled &= ~(1 << AMDGPU_RAS_BLOCK__XGMI_WAFL); 4095 } else { 4096 dev_info(adev->dev, "SRAM ECC is not present.\n"); 4097 } 4098 } 4099
4100 /* Query poison mode from umc/df IP callbacks */ 4101 static void amdgpu_ras_query_poison_mode(struct amdgpu_device *adev) 4102 { 4103 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 4104 bool df_poison, umc_poison; 4105
4106 /* poison setting is useless on SRIOV guest */ 4107 if (amdgpu_sriov_vf(adev) || !con) 4108 return; 4109
4110 /* Init poison supported flag, the default value is false */ 4111 if (adev->gmc.xgmi.connected_to_cpu || 4112 adev->gmc.is_app_apu) { 4113 /* enabled by default when GPU is connected to CPU */ 4114 con->poison_supported = true; 4115 } else if (adev->df.funcs && 4116 adev->df.funcs->query_ras_poison_mode && 4117 adev->umc.ras && 4118 adev->umc.ras->query_ras_poison_mode) { 4119 df_poison = 4120 adev->df.funcs->query_ras_poison_mode(adev); 4121 umc_poison = 4122 adev->umc.ras->query_ras_poison_mode(adev); 4123
4124 /* Poison is supported only when it is enabled in both DF and UMC */ 4125 if (df_poison && umc_poison) 4126 con->poison_supported = true; 4127 else if (df_poison != umc_poison) 4128 dev_warn(adev->dev, 4129 "Poison setting is inconsistent in DF/UMC(%d:%d)!\n", 4130 df_poison, umc_poison); 4131 } 4132 } 4133
4134 /* 4135 * Check the hardware's ras ability, which will be saved in hw_supported. 4136 * If the hardware does not support ras, we can skip some ras initialization 4137 * and forbid some ras operations from the IP blocks. 4138 * If software itself (say, a boot parameter) limits the ras ability, we still 4139 * need to allow the IP blocks to do some limited operations, like disable. 4140 * In that case we have to initialize ras as normal, but each function must 4141 * check whether the operation is allowed.
4142 */ 4143 static void amdgpu_ras_check_supported(struct amdgpu_device *adev) 4144 { 4145 adev->ras_hw_enabled = adev->ras_enabled = 0; 4146
4147 if (!amdgpu_ras_asic_supported(adev)) 4148 return; 4149
4150 if (amdgpu_sriov_vf(adev)) { 4151 if (amdgpu_virt_get_ras_capability(adev)) 4152 goto init_ras_enabled_flag; 4153 } 4154
4155 /* query ras capability from psp */ 4156 if (amdgpu_psp_get_ras_capability(&adev->psp)) 4157 goto init_ras_enabled_flag; 4158
4159 /* query ras capability from bios */ 4160 if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) { 4161 amdgpu_ras_query_ras_capablity_from_vbios(adev); 4162 } else { 4163 /* driver only manages RAS features for a few IP blocks 4164 * when the GPU is connected to the CPU through XGMI */ 4165 adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__GFX | 4166 1 << AMDGPU_RAS_BLOCK__SDMA | 4167 1 << AMDGPU_RAS_BLOCK__MMHUB); 4168 } 4169
4170 /* apply asic specific settings (vega20 only for now) */ 4171 amdgpu_ras_get_quirks(adev); 4172
4173 /* query poison mode from umc/df ip callback */ 4174 amdgpu_ras_query_poison_mode(adev); 4175
4176 init_ras_enabled_flag: 4177 /* hw_supported needs to be aligned with RAS block mask. */ 4178 adev->ras_hw_enabled &= AMDGPU_RAS_BLOCK_MASK; 4179
4180 adev->ras_enabled = amdgpu_ras_enable == 0 ? 0 : 4181 adev->ras_hw_enabled & amdgpu_ras_mask; 4182
4183 /* aca is disabled by default except for psp v13_0_6/v13_0_12/v13_0_14/v13_0_15 */ 4184 if (!amdgpu_sriov_vf(adev)) { 4185 adev->aca.is_enabled = 4186 (amdgpu_ip_version(adev, MP0_HWIP, 0) == IP_VERSION(13, 0, 6) || 4187 amdgpu_ip_version(adev, MP0_HWIP, 0) == IP_VERSION(13, 0, 12) || 4188 amdgpu_ip_version(adev, MP0_HWIP, 0) == IP_VERSION(13, 0, 14) || 4189 amdgpu_ip_version(adev, MP0_HWIP, 0) == IP_VERSION(13, 0, 15)); 4190 } 4191
4192 /* bad page feature is not applicable to specific app platform */ 4193 if (adev->gmc.is_app_apu && 4194 amdgpu_ip_version(adev, UMC_HWIP, 0) == IP_VERSION(12, 0, 0)) 4195 amdgpu_bad_page_threshold = 0; 4196 } 4197
4198 static void amdgpu_ras_counte_dw(struct work_struct *work) 4199 { 4200 struct amdgpu_ras *con = container_of(work, struct amdgpu_ras, 4201 ras_counte_delay_work.work); 4202 struct amdgpu_device *adev = con->adev; 4203 struct drm_device *dev = adev_to_drm(adev); 4204 unsigned long ce_count, ue_count; 4205 int res; 4206
4207 res = pm_runtime_get_sync(dev->dev); 4208 if (res < 0) 4209 goto Out; 4210
4211 /* Cache new values. 4212 */ 4213 if (amdgpu_ras_query_error_count(adev, &ce_count, &ue_count, NULL) == 0) { 4214 atomic_set(&con->ras_ce_count, ce_count); 4215 atomic_set(&con->ras_ue_count, ue_count); 4216 } 4217
4218 Out: 4219 pm_runtime_put_autosuspend(dev->dev); 4220 } 4221
4222 static int amdgpu_get_ras_schema(struct amdgpu_device *adev) 4223 { 4224 return amdgpu_ras_is_poison_mode_supported(adev) ?
AMDGPU_RAS_ERROR__POISON : 0 | 4225 AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE | 4226 AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE | 4227 AMDGPU_RAS_ERROR__PARITY; 4228 } 4229 4230 static void ras_event_mgr_init(struct ras_event_manager *mgr) 4231 { 4232 struct ras_event_state *event_state; 4233 int i; 4234 4235 memset(mgr, 0, sizeof(*mgr)); 4236 atomic64_set(&mgr->seqno, 0); 4237 4238 for (i = 0; i < ARRAY_SIZE(mgr->event_state); i++) { 4239 event_state = &mgr->event_state[i]; 4240 event_state->last_seqno = RAS_EVENT_INVALID_ID; 4241 atomic64_set(&event_state->count, 0); 4242 } 4243 } 4244 4245 static void amdgpu_ras_event_mgr_init(struct amdgpu_device *adev) 4246 { 4247 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 4248 struct amdgpu_hive_info *hive; 4249 4250 if (!ras) 4251 return; 4252 4253 hive = amdgpu_get_xgmi_hive(adev); 4254 ras->event_mgr = hive ? &hive->event_mgr : &ras->__event_mgr; 4255 4256 /* init event manager with node 0 on xgmi system */ 4257 if (!amdgpu_reset_in_recovery(adev)) { 4258 if (!hive || adev->gmc.xgmi.node_id == 0) 4259 ras_event_mgr_init(ras->event_mgr); 4260 } 4261 4262 if (hive) 4263 amdgpu_put_xgmi_hive(hive); 4264 } 4265 4266 static void amdgpu_ras_init_reserved_vram_size(struct amdgpu_device *adev) 4267 { 4268 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 4269 4270 if (!con || (adev->flags & AMD_IS_APU)) 4271 return; 4272 4273 switch (amdgpu_ip_version(adev, MP0_HWIP, 0)) { 4274 case IP_VERSION(13, 0, 2): 4275 case IP_VERSION(13, 0, 6): 4276 case IP_VERSION(13, 0, 12): 4277 case IP_VERSION(13, 0, 15): 4278 con->reserved_pages_in_bytes = AMDGPU_RAS_RESERVED_VRAM_SIZE_DEFAULT; 4279 break; 4280 case IP_VERSION(13, 0, 14): 4281 con->reserved_pages_in_bytes = (AMDGPU_RAS_RESERVED_VRAM_SIZE_DEFAULT << 1); 4282 break; 4283 default: 4284 break; 4285 } 4286 } 4287 4288 int amdgpu_ras_init(struct amdgpu_device *adev) 4289 { 4290 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 4291 int r; 4292 4293 if (con) 4294 return 0; 4295 4296 con = kzalloc(sizeof(*con) + 4297 sizeof(struct ras_manager) * AMDGPU_RAS_BLOCK_COUNT + 4298 sizeof(struct ras_manager) * AMDGPU_RAS_MCA_BLOCK_COUNT, 4299 GFP_KERNEL); 4300 if (!con) 4301 return -ENOMEM; 4302 4303 con->adev = adev; 4304 INIT_DELAYED_WORK(&con->ras_counte_delay_work, amdgpu_ras_counte_dw); 4305 atomic_set(&con->ras_ce_count, 0); 4306 atomic_set(&con->ras_ue_count, 0); 4307 4308 con->objs = (struct ras_manager *)(con + 1); 4309 4310 amdgpu_ras_set_context(adev, con); 4311 4312 amdgpu_ras_check_supported(adev); 4313 4314 if (!adev->ras_enabled || adev->asic_type == CHIP_VEGA10) { 4315 /* set gfx block ras context feature for VEGA20 Gaming 4316 * send ras disable cmd to ras ta during ras late init. 4317 */ 4318 if (!adev->ras_enabled && adev->asic_type == CHIP_VEGA20) { 4319 con->features |= BIT(AMDGPU_RAS_BLOCK__GFX); 4320 4321 return 0; 4322 } 4323 4324 r = 0; 4325 goto release_con; 4326 } 4327 4328 con->update_channel_flag = false; 4329 con->features = 0; 4330 con->schema = 0; 4331 INIT_LIST_HEAD(&con->head); 4332 /* Might need get this flag from vbios. 
*/ 4333 con->flags = RAS_DEFAULT_FLAGS; 4334
4335 /* initialize the nbio ras function ahead of any other 4336 * ras functions so the hardware fatal error interrupt 4337 * can be enabled as early as possible */ 4338 switch (amdgpu_ip_version(adev, NBIO_HWIP, 0)) { 4339 case IP_VERSION(7, 4, 0): 4340 case IP_VERSION(7, 4, 1): 4341 case IP_VERSION(7, 4, 4): 4342 if (!adev->gmc.xgmi.connected_to_cpu) 4343 adev->nbio.ras = &nbio_v7_4_ras; 4344 break; 4345 case IP_VERSION(4, 3, 0): 4346 if (adev->ras_hw_enabled & (1 << AMDGPU_RAS_BLOCK__DF)) 4347 /* unlike other generations of nbio ras, 4348 * nbio v4_3 only supports the fatal error interrupt, 4349 * which informs software that DF is frozen due to a 4350 * system fatal error event. The driver should not 4351 * enable nbio ras in that case. Instead, 4352 * check DF RAS */ 4353 adev->nbio.ras = &nbio_v4_3_ras; 4354 break; 4355 case IP_VERSION(6, 3, 1): 4356 if (adev->ras_hw_enabled & (1 << AMDGPU_RAS_BLOCK__DF)) 4357 /* unlike other generations of nbio ras, 4358 * nbif v6_3_1 only supports the fatal error interrupt, 4359 * which informs software that DF is frozen due to a 4360 * system fatal error event. The driver should not 4361 * enable nbio ras in that case. Instead, 4362 * check DF RAS 4363 */ 4364 adev->nbio.ras = &nbif_v6_3_1_ras; 4365 break; 4366 case IP_VERSION(7, 9, 0): 4367 case IP_VERSION(7, 9, 1): 4368 if (!adev->gmc.is_app_apu) 4369 adev->nbio.ras = &nbio_v7_9_ras; 4370 break; 4371 default: 4372 /* nbio ras is not available */ 4373 break; 4374 } 4375
4376 /* the nbio ras block needs to be enabled ahead of other ras blocks 4377 * to handle fatal errors */ 4378 r = amdgpu_nbio_ras_sw_init(adev); 4379 if (r) 4380 goto release_con; 4381
4382 if (adev->nbio.ras && 4383 adev->nbio.ras->init_ras_controller_interrupt) { 4384 r = adev->nbio.ras->init_ras_controller_interrupt(adev); 4385 if (r) 4386 goto release_con; 4387 } 4388
4389 if (adev->nbio.ras && 4390 adev->nbio.ras->init_ras_err_event_athub_interrupt) { 4391 r = adev->nbio.ras->init_ras_err_event_athub_interrupt(adev); 4392 if (r) 4393 goto release_con; 4394 } 4395
4396 /* Pack socket_id into ras feature mask bits[31:29] */ 4397 if (adev->smuio.funcs && 4398 adev->smuio.funcs->get_socket_id) 4399 con->features |= ((adev->smuio.funcs->get_socket_id(adev)) << 4400 AMDGPU_RAS_FEATURES_SOCKETID_SHIFT); 4401
4402 /* Get RAS schema for particular SOC */ 4403 con->schema = amdgpu_get_ras_schema(adev); 4404
4405 amdgpu_ras_init_reserved_vram_size(adev); 4406
4407 if (amdgpu_ras_fs_init(adev)) { 4408 r = -EINVAL; 4409 goto release_con; 4410 } 4411
4412 if (amdgpu_ras_aca_is_supported(adev)) { 4413 if (amdgpu_aca_is_enabled(adev)) 4414 r = amdgpu_aca_init(adev); 4415 else 4416 r = amdgpu_mca_init(adev); 4417 if (r) 4418 goto release_con; 4419 } 4420
4421 con->init_task_pid = task_pid_nr(current); 4422 get_task_comm(con->init_task_comm, current); 4423
4424 mutex_init(&con->critical_region_lock); 4425 INIT_LIST_HEAD(&con->critical_region_head); 4426
4427 dev_info(adev->dev, "RAS INFO: ras initialized successfully, " 4428 "hardware ability[%x] ras_mask[%x]\n", 4429 adev->ras_hw_enabled, adev->ras_enabled); 4430
4431 return 0; 4432 release_con: 4433 amdgpu_ras_set_context(adev, NULL); 4434 kfree(con); 4435
4436 return r; 4437 } 4438
4439 int amdgpu_persistent_edc_harvesting_supported(struct amdgpu_device *adev) 4440 { 4441 if (adev->gmc.xgmi.connected_to_cpu || 4442 adev->gmc.is_app_apu) 4443 return 1; 4444 return 0; 4445 } 4446
4447 static int amdgpu_persistent_edc_harvesting(struct amdgpu_device *adev, 4448 struct ras_common_if *ras_block)
4449 { 4450 struct ras_query_if info = { 4451 .head = *ras_block, 4452 }; 4453
4454 if (!amdgpu_persistent_edc_harvesting_supported(adev)) 4455 return 0; 4456
4457 if (amdgpu_ras_query_error_status(adev, &info) != 0) 4458 drm_warn(adev_to_drm(adev), "RAS init query failure"); 4459
4460 if (amdgpu_ras_reset_error_status(adev, ras_block->block) != 0) 4461 drm_warn(adev_to_drm(adev), "RAS init harvest reset failure"); 4462
4463 return 0; 4464 } 4465
4466 bool amdgpu_ras_is_poison_mode_supported(struct amdgpu_device *adev) 4467 { 4468 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 4469
4470 if (!con) 4471 return false; 4472
4473 return con->poison_supported; 4474 } 4475
4476 /* helper function to handle common stuff in ip late init phase */ 4477 int amdgpu_ras_block_late_init(struct amdgpu_device *adev, 4478 struct ras_common_if *ras_block) 4479 { 4480 struct amdgpu_ras_block_object *ras_obj = NULL; 4481 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 4482 struct ras_query_if *query_info; 4483 unsigned long ue_count, ce_count; 4484 int r; 4485
4486 /* disable RAS feature per IP block if it is not supported */ 4487 if (!amdgpu_ras_is_supported(adev, ras_block->block)) { 4488 amdgpu_ras_feature_enable_on_boot(adev, ras_block, 0); 4489 return 0; 4490 } 4491
4492 r = amdgpu_ras_feature_enable_on_boot(adev, ras_block, 1); 4493 if (r) { 4494 if (adev->in_suspend || amdgpu_reset_in_recovery(adev)) { 4495 /* in the resume phase, if we fail to enable ras, 4496 * clean up all ras fs nodes and disable ras */ 4497 goto cleanup; 4498 } else 4499 return r; 4500 } 4501
4502 /* check for errors on warm reset on ASICs with persistent EDC support */ 4503 amdgpu_persistent_edc_harvesting(adev, ras_block); 4504
4505 /* in resume phase, no need to create ras fs node */ 4506 if (adev->in_suspend || amdgpu_reset_in_recovery(adev)) 4507 return 0; 4508
4509 ras_obj = container_of(ras_block, struct amdgpu_ras_block_object, ras_comm); 4510 if (ras_obj->ras_cb || (ras_obj->hw_ops && 4511 (ras_obj->hw_ops->query_poison_status || 4512 ras_obj->hw_ops->handle_poison_consumption))) { 4513 r = amdgpu_ras_interrupt_add_handler(adev, ras_block); 4514 if (r) 4515 goto cleanup; 4516 } 4517
4518 if (ras_obj->hw_ops && 4519 (ras_obj->hw_ops->query_ras_error_count || 4520 ras_obj->hw_ops->query_ras_error_status)) { 4521 r = amdgpu_ras_sysfs_create(adev, ras_block); 4522 if (r) 4523 goto interrupt; 4524
4525 /* Those are the cached values at init.
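 * The query below seeds con->ras_ce_count and con->ras_ue_count with the
 * correctable/uncorrectable totals read for this block at init time.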
4526 */ 4527 query_info = kzalloc_obj(*query_info); 4528 if (!query_info) 4529 return -ENOMEM; 4530 memcpy(&query_info->head, ras_block, sizeof(struct ras_common_if)); 4531 4532 if (amdgpu_ras_query_error_count(adev, &ce_count, &ue_count, query_info) == 0) { 4533 atomic_set(&con->ras_ce_count, ce_count); 4534 atomic_set(&con->ras_ue_count, ue_count); 4535 } 4536 4537 kfree(query_info); 4538 } 4539 4540 return 0; 4541 4542 interrupt: 4543 if (ras_obj->ras_cb) 4544 amdgpu_ras_interrupt_remove_handler(adev, ras_block); 4545 cleanup: 4546 amdgpu_ras_feature_enable(adev, ras_block, 0); 4547 return r; 4548 } 4549 4550 static int amdgpu_ras_block_late_init_default(struct amdgpu_device *adev, 4551 struct ras_common_if *ras_block) 4552 { 4553 return amdgpu_ras_block_late_init(adev, ras_block); 4554 } 4555 4556 /* helper function to remove ras fs node and interrupt handler */ 4557 void amdgpu_ras_block_late_fini(struct amdgpu_device *adev, 4558 struct ras_common_if *ras_block) 4559 { 4560 struct amdgpu_ras_block_object *ras_obj; 4561 if (!ras_block) 4562 return; 4563 4564 amdgpu_ras_sysfs_remove(adev, ras_block); 4565 4566 ras_obj = container_of(ras_block, struct amdgpu_ras_block_object, ras_comm); 4567 if (ras_obj->ras_cb) 4568 amdgpu_ras_interrupt_remove_handler(adev, ras_block); 4569 } 4570 4571 static void amdgpu_ras_block_late_fini_default(struct amdgpu_device *adev, 4572 struct ras_common_if *ras_block) 4573 { 4574 return amdgpu_ras_block_late_fini(adev, ras_block); 4575 } 4576 4577 /* do some init work after IP late init as dependence. 4578 * and it runs in resume/gpu reset/booting up cases. 4579 */ 4580 void amdgpu_ras_resume(struct amdgpu_device *adev) 4581 { 4582 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 4583 struct ras_manager *obj, *tmp; 4584 4585 if (!adev->ras_enabled || !con) { 4586 /* clean ras context for VEGA20 Gaming after send ras disable cmd */ 4587 amdgpu_release_ras_context(adev); 4588 4589 return; 4590 } 4591 4592 if (con->flags & AMDGPU_RAS_FLAG_INIT_BY_VBIOS) { 4593 /* Set up all other IPs which are not implemented. There is a 4594 * tricky thing that IP's actual ras error type should be 4595 * MULTI_UNCORRECTABLE, but as driver does not handle it, so 4596 * ERROR_NONE make sense anyway. 4597 */ 4598 amdgpu_ras_enable_all_features(adev, 1); 4599 4600 /* We enable ras on all hw_supported block, but as boot 4601 * parameter might disable some of them and one or more IP has 4602 * not implemented yet. So we disable them on behalf. 4603 */ 4604 list_for_each_entry_safe(obj, tmp, &con->head, node) { 4605 if (!amdgpu_ras_is_supported(adev, obj->head.block)) { 4606 amdgpu_ras_feature_enable(adev, &obj->head, 0); 4607 /* there should be no any reference. */ 4608 WARN_ON(alive_obj(obj)); 4609 } 4610 } 4611 } 4612 } 4613 4614 void amdgpu_ras_suspend(struct amdgpu_device *adev) 4615 { 4616 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 4617 4618 if (!adev->ras_enabled || !con) 4619 return; 4620 4621 amdgpu_ras_disable_all_features(adev, 0); 4622 /* Make sure all ras objects are disabled. 
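If any feature bit is still set after the unforced disable above, force-disable so no block stays enabled across suspend.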
*/ 4623 if (AMDGPU_RAS_GET_FEATURES(con->features)) 4624 amdgpu_ras_disable_all_features(adev, 1); 4625 } 4626 4627 int amdgpu_ras_late_init(struct amdgpu_device *adev) 4628 { 4629 struct amdgpu_ras_block_list *node, *tmp; 4630 struct amdgpu_ras_block_object *obj; 4631 int r; 4632 4633 amdgpu_ras_event_mgr_init(adev); 4634 4635 if (amdgpu_ras_aca_is_supported(adev)) { 4636 if (amdgpu_reset_in_recovery(adev)) { 4637 if (amdgpu_aca_is_enabled(adev)) 4638 r = amdgpu_aca_reset(adev); 4639 else 4640 r = amdgpu_mca_reset(adev); 4641 if (r) 4642 return r; 4643 } 4644 4645 if (!amdgpu_sriov_vf(adev)) { 4646 if (amdgpu_aca_is_enabled(adev)) 4647 amdgpu_ras_set_aca_debug_mode(adev, false); 4648 else 4649 amdgpu_ras_set_mca_debug_mode(adev, false); 4650 } 4651 } 4652 4653 /* Guest side doesn't need init ras feature */ 4654 if (amdgpu_sriov_vf(adev) && !amdgpu_sriov_ras_telemetry_en(adev)) 4655 return 0; 4656 4657 list_for_each_entry_safe(node, tmp, &adev->ras_list, node) { 4658 obj = node->ras_obj; 4659 if (!obj) { 4660 dev_warn(adev->dev, "Warning: abnormal ras list node.\n"); 4661 continue; 4662 } 4663 4664 if (!amdgpu_ras_is_supported(adev, obj->ras_comm.block)) 4665 continue; 4666 4667 if (obj->ras_late_init) { 4668 r = obj->ras_late_init(adev, &obj->ras_comm); 4669 if (r) { 4670 dev_err(adev->dev, "%s failed to execute ras_late_init! ret:%d\n", 4671 obj->ras_comm.name, r); 4672 return r; 4673 } 4674 } else 4675 amdgpu_ras_block_late_init_default(adev, &obj->ras_comm); 4676 } 4677 4678 amdgpu_ras_check_bad_page_status(adev); 4679 4680 return 0; 4681 } 4682 4683 /* do some fini work before IP fini as dependence */ 4684 int amdgpu_ras_pre_fini(struct amdgpu_device *adev) 4685 { 4686 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 4687 4688 if (!adev->ras_enabled || !con) 4689 return 0; 4690 4691 4692 /* Need disable ras on all IPs here before ip [hw/sw]fini */ 4693 if (AMDGPU_RAS_GET_FEATURES(con->features)) 4694 amdgpu_ras_disable_all_features(adev, 0); 4695 amdgpu_ras_recovery_fini(adev); 4696 return 0; 4697 } 4698 4699 int amdgpu_ras_fini(struct amdgpu_device *adev) 4700 { 4701 struct amdgpu_ras_block_list *ras_node, *tmp; 4702 struct amdgpu_ras_block_object *obj = NULL; 4703 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 4704 4705 if (!adev->ras_enabled || !con) 4706 return 0; 4707 4708 amdgpu_ras_critical_region_fini(adev); 4709 mutex_destroy(&con->critical_region_lock); 4710 4711 list_for_each_entry_safe(ras_node, tmp, &adev->ras_list, node) { 4712 if (ras_node->ras_obj) { 4713 obj = ras_node->ras_obj; 4714 if (amdgpu_ras_is_supported(adev, obj->ras_comm.block) && 4715 obj->ras_fini) 4716 obj->ras_fini(adev, &obj->ras_comm); 4717 else 4718 amdgpu_ras_block_late_fini_default(adev, &obj->ras_comm); 4719 } 4720 4721 /* Clear ras blocks from ras_list and free ras block list node */ 4722 list_del(&ras_node->node); 4723 kfree(ras_node); 4724 } 4725 4726 amdgpu_ras_fs_fini(adev); 4727 amdgpu_ras_interrupt_remove_all(adev); 4728 4729 if (amdgpu_ras_aca_is_supported(adev)) { 4730 if (amdgpu_aca_is_enabled(adev)) 4731 amdgpu_aca_fini(adev); 4732 else 4733 amdgpu_mca_fini(adev); 4734 } 4735 4736 WARN(AMDGPU_RAS_GET_FEATURES(con->features), "Feature mask is not cleared"); 4737 4738 if (AMDGPU_RAS_GET_FEATURES(con->features)) 4739 amdgpu_ras_disable_all_features(adev, 0); 4740 4741 cancel_delayed_work_sync(&con->ras_counte_delay_work); 4742 4743 amdgpu_ras_set_context(adev, NULL); 4744 kfree(con); 4745 4746 return 0; 4747 } 4748 4749 bool amdgpu_ras_get_fed_status(struct amdgpu_device 
*adev) 4750 { 4751 struct amdgpu_ras *ras; 4752 4753 ras = amdgpu_ras_get_context(adev); 4754 if (!ras) 4755 return false; 4756 4757 return test_bit(AMDGPU_RAS_BLOCK__LAST, &ras->ras_err_state); 4758 } 4759 4760 void amdgpu_ras_set_fed(struct amdgpu_device *adev, bool status) 4761 { 4762 struct amdgpu_ras *ras; 4763 4764 ras = amdgpu_ras_get_context(adev); 4765 if (ras) { 4766 if (status) 4767 set_bit(AMDGPU_RAS_BLOCK__LAST, &ras->ras_err_state); 4768 else 4769 clear_bit(AMDGPU_RAS_BLOCK__LAST, &ras->ras_err_state); 4770 } 4771 } 4772 4773 void amdgpu_ras_clear_err_state(struct amdgpu_device *adev) 4774 { 4775 struct amdgpu_ras *ras; 4776 4777 ras = amdgpu_ras_get_context(adev); 4778 if (ras) { 4779 ras->ras_err_state = 0; 4780 ras->gpu_reset_flags = 0; 4781 } 4782 } 4783 4784 void amdgpu_ras_set_err_poison(struct amdgpu_device *adev, 4785 enum amdgpu_ras_block block) 4786 { 4787 struct amdgpu_ras *ras; 4788 4789 ras = amdgpu_ras_get_context(adev); 4790 if (ras) 4791 set_bit(block, &ras->ras_err_state); 4792 } 4793 4794 bool amdgpu_ras_is_err_state(struct amdgpu_device *adev, int block) 4795 { 4796 struct amdgpu_ras *ras; 4797 4798 ras = amdgpu_ras_get_context(adev); 4799 if (ras) { 4800 if (block == AMDGPU_RAS_BLOCK__ANY) 4801 return (ras->ras_err_state != 0); 4802 else 4803 return test_bit(block, &ras->ras_err_state) || 4804 test_bit(AMDGPU_RAS_BLOCK__LAST, 4805 &ras->ras_err_state); 4806 } 4807 4808 return false; 4809 } 4810 4811 static struct ras_event_manager *__get_ras_event_mgr(struct amdgpu_device *adev) 4812 { 4813 struct amdgpu_ras *ras; 4814 4815 ras = amdgpu_ras_get_context(adev); 4816 if (!ras) 4817 return NULL; 4818 4819 return ras->event_mgr; 4820 } 4821 4822 int amdgpu_ras_mark_ras_event_caller(struct amdgpu_device *adev, enum ras_event_type type, 4823 const void *caller) 4824 { 4825 struct ras_event_manager *event_mgr; 4826 struct ras_event_state *event_state; 4827 int ret = 0; 4828 4829 if (amdgpu_uniras_enabled(adev)) 4830 return 0; 4831 4832 if (type >= RAS_EVENT_TYPE_COUNT) { 4833 ret = -EINVAL; 4834 goto out; 4835 } 4836 4837 event_mgr = __get_ras_event_mgr(adev); 4838 if (!event_mgr) { 4839 ret = -EINVAL; 4840 goto out; 4841 } 4842 4843 event_state = &event_mgr->event_state[type]; 4844 event_state->last_seqno = atomic64_inc_return(&event_mgr->seqno); 4845 atomic64_inc(&event_state->count); 4846 4847 out: 4848 if (ret && caller) 4849 dev_warn(adev->dev, "failed mark ras event (%d) in %ps, ret:%d\n", 4850 (int)type, caller, ret); 4851 4852 return ret; 4853 } 4854 4855 u64 amdgpu_ras_acquire_event_id(struct amdgpu_device *adev, enum ras_event_type type) 4856 { 4857 struct ras_event_manager *event_mgr; 4858 u64 id; 4859 4860 if (type >= RAS_EVENT_TYPE_COUNT) 4861 return RAS_EVENT_INVALID_ID; 4862 4863 switch (type) { 4864 case RAS_EVENT_TYPE_FATAL: 4865 case RAS_EVENT_TYPE_POISON_CREATION: 4866 case RAS_EVENT_TYPE_POISON_CONSUMPTION: 4867 event_mgr = __get_ras_event_mgr(adev); 4868 if (!event_mgr) 4869 return RAS_EVENT_INVALID_ID; 4870 4871 id = event_mgr->event_state[type].last_seqno; 4872 break; 4873 case RAS_EVENT_TYPE_INVALID: 4874 default: 4875 id = RAS_EVENT_INVALID_ID; 4876 break; 4877 } 4878 4879 return id; 4880 } 4881 4882 int amdgpu_ras_global_ras_isr(struct amdgpu_device *adev) 4883 { 4884 if (atomic_cmpxchg(&amdgpu_ras_in_intr, 0, 1) == 0) { 4885 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 4886 enum ras_event_type type = RAS_EVENT_TYPE_FATAL; 4887 u64 event_id = RAS_EVENT_INVALID_ID; 4888 4889 if (amdgpu_uniras_enabled(adev)) 4890 return 0; 4891 
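/* Mark the fatal event first so the RAS_EVENT_LOG below carries a valid sequence id */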
4892 if (!amdgpu_ras_mark_ras_event(adev, type)) 4893 event_id = amdgpu_ras_acquire_event_id(adev, type); 4894 4895 RAS_EVENT_LOG(adev, event_id, "uncorrectable hardware error" 4896 "(ERREVENT_ATHUB_INTERRUPT) detected!\n"); 4897 4898 amdgpu_ras_set_fed(adev, true); 4899 ras->gpu_reset_flags |= AMDGPU_RAS_GPU_RESET_MODE1_RESET; 4900 amdgpu_ras_reset_gpu(adev); 4901 } 4902 4903 return -EBUSY; 4904 } 4905 4906 bool amdgpu_ras_need_emergency_restart(struct amdgpu_device *adev) 4907 { 4908 if (adev->asic_type == CHIP_VEGA20 && 4909 adev->pm.fw_version <= 0x283400) { 4910 return !(amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) && 4911 amdgpu_ras_intr_triggered(); 4912 } 4913 4914 return false; 4915 } 4916 4917 void amdgpu_release_ras_context(struct amdgpu_device *adev) 4918 { 4919 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 4920 4921 if (!con) 4922 return; 4923 4924 if (!adev->ras_enabled && con->features & BIT(AMDGPU_RAS_BLOCK__GFX)) { 4925 con->features &= ~BIT(AMDGPU_RAS_BLOCK__GFX); 4926 amdgpu_ras_set_context(adev, NULL); 4927 kfree(con); 4928 } 4929 } 4930 4931 #ifdef CONFIG_X86_MCE_AMD 4932 static struct amdgpu_device *find_adev(uint32_t node_id) 4933 { 4934 int i; 4935 struct amdgpu_device *adev = NULL; 4936 4937 for (i = 0; i < mce_adev_list.num_gpu; i++) { 4938 adev = mce_adev_list.devs[i]; 4939 4940 if (adev && adev->gmc.xgmi.connected_to_cpu && 4941 adev->gmc.xgmi.physical_node_id == node_id) 4942 break; 4943 adev = NULL; 4944 } 4945 4946 return adev; 4947 } 4948 4949 #define GET_MCA_IPID_GPUID(m) (((m) >> 44) & 0xF) 4950 #define GET_UMC_INST(m) (((m) >> 21) & 0x7) 4951 #define GET_CHAN_INDEX(m) ((((m) >> 12) & 0x3) | (((m) >> 18) & 0x4)) 4952 #define GPU_ID_OFFSET 8 4953 4954 static int amdgpu_bad_page_notifier(struct notifier_block *nb, 4955 unsigned long val, void *data) 4956 { 4957 struct mce *m = (struct mce *)data; 4958 struct amdgpu_device *adev = NULL; 4959 uint32_t gpu_id = 0; 4960 uint32_t umc_inst = 0, ch_inst = 0; 4961 4962 /* 4963 * If the error was generated in UMC_V2, which belongs to GPU UMCs, 4964 * and error occurred in DramECC (Extended error code = 0) then only 4965 * process the error, else bail out. 4966 */ 4967 if (!m || !((smca_get_bank_type(m->extcpu, m->bank) == SMCA_UMC_V2) && 4968 (XEC(m->status, 0x3f) == 0x0))) 4969 return NOTIFY_DONE; 4970 4971 /* 4972 * If it is correctable error, return. 4973 */ 4974 if (mce_is_correctable(m)) 4975 return NOTIFY_OK; 4976 4977 /* 4978 * GPU Id is offset by GPU_ID_OFFSET in MCA_IPID_UMC register. 4979 */ 4980 gpu_id = GET_MCA_IPID_GPUID(m->ipid) - GPU_ID_OFFSET; 4981 4982 adev = find_adev(gpu_id); 4983 if (!adev) { 4984 DRM_WARN("%s: Unable to find adev for gpu_id: %d\n", __func__, 4985 gpu_id); 4986 return NOTIFY_DONE; 4987 } 4988 4989 /* 4990 * If it is uncorrectable error, then find out UMC instance and 4991 * channel index. 4992 */ 4993 umc_inst = GET_UMC_INST(m->ipid); 4994 ch_inst = GET_CHAN_INDEX(m->ipid); 4995 4996 dev_info(adev->dev, "Uncorrectable error detected in UMC inst: %d, chan_idx: %d", 4997 umc_inst, ch_inst); 4998 4999 if (!amdgpu_umc_page_retirement_mca(adev, m->addr, ch_inst, umc_inst)) 5000 return NOTIFY_OK; 5001 else 5002 return NOTIFY_DONE; 5003 } 5004 5005 static struct notifier_block amdgpu_bad_page_nb = { 5006 .notifier_call = amdgpu_bad_page_notifier, 5007 .priority = MCE_PRIO_UC, 5008 }; 5009 5010 static void amdgpu_register_bad_pages_mca_notifier(struct amdgpu_device *adev) 5011 { 5012 /* 5013 * Add the adev to the mce_adev_list. 
5014 * During mode2 reset, amdgpu device is temporarily 5015 * removed from the mgpu_info list which can cause 5016 * page retirement to fail. 5017 * Use this list instead of mgpu_info to find the amdgpu 5018 * device on which the UMC error was reported. 5019 */ 5020 mce_adev_list.devs[mce_adev_list.num_gpu++] = adev; 5021 5022 /* 5023 * Register the x86 notifier only once 5024 * with MCE subsystem. 5025 */ 5026 if (notifier_registered == false) { 5027 mce_register_decode_chain(&amdgpu_bad_page_nb); 5028 notifier_registered = true; 5029 } 5030 } 5031 static void amdgpu_unregister_bad_pages_mca_notifier(struct amdgpu_device *adev) 5032 { 5033 int i, j; 5034 5035 if (!notifier_registered && !mce_adev_list.num_gpu) 5036 return; 5037 for (i = 0, j = 0; i < mce_adev_list.num_gpu; i++) { 5038 if (mce_adev_list.devs[i] == adev) 5039 mce_adev_list.devs[i] = NULL; 5040 if (!mce_adev_list.devs[i]) 5041 ++j; 5042 } 5043 5044 if (j == mce_adev_list.num_gpu) { 5045 mce_adev_list.num_gpu = 0; 5046 /* Unregister x86 notifier with MCE subsystem. */ 5047 if (notifier_registered) { 5048 mce_unregister_decode_chain(&amdgpu_bad_page_nb); 5049 notifier_registered = false; 5050 } 5051 } 5052 } 5053 #endif 5054 5055 struct amdgpu_ras *amdgpu_ras_get_context(struct amdgpu_device *adev) 5056 { 5057 if (!adev) 5058 return NULL; 5059 5060 return adev->psp.ras_context.ras; 5061 } 5062 5063 int amdgpu_ras_set_context(struct amdgpu_device *adev, struct amdgpu_ras *ras_con) 5064 { 5065 if (!adev) 5066 return -EINVAL; 5067 5068 adev->psp.ras_context.ras = ras_con; 5069 return 0; 5070 } 5071 5072 /* check if ras is supported on block, say, sdma, gfx */ 5073 int amdgpu_ras_is_supported(struct amdgpu_device *adev, 5074 unsigned int block) 5075 { 5076 int ret = 0; 5077 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 5078 5079 if (block >= AMDGPU_RAS_BLOCK_COUNT) 5080 return 0; 5081 5082 ret = ras && (adev->ras_enabled & (1 << block)); 5083 5084 /* For the special asic with mem ecc enabled but sram ecc 5085 * not enabled, even if the ras block is not supported on 5086 * .ras_enabled, if the asic supports poison mode and the 5087 * ras block has ras configuration, it can be considered 5088 * that the ras block supports ras function. 5089 */ 5090 if (!ret && 5091 (block == AMDGPU_RAS_BLOCK__GFX || 5092 block == AMDGPU_RAS_BLOCK__SDMA || 5093 block == AMDGPU_RAS_BLOCK__VCN || 5094 block == AMDGPU_RAS_BLOCK__JPEG) && 5095 (amdgpu_ras_mask & (1 << block)) && 5096 amdgpu_ras_is_poison_mode_supported(adev) && 5097 amdgpu_ras_get_ras_block(adev, block, 0)) 5098 ret = 1; 5099 5100 return ret; 5101 } 5102 5103 int amdgpu_ras_reset_gpu(struct amdgpu_device *adev) 5104 { 5105 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 5106 5107 /* mode1 is the only selection for RMA status */ 5108 if (amdgpu_ras_is_rma(adev)) { 5109 ras->gpu_reset_flags = 0; 5110 ras->gpu_reset_flags |= AMDGPU_RAS_GPU_RESET_MODE1_RESET; 5111 } 5112 5113 if (atomic_cmpxchg(&ras->in_recovery, 0, 1) == 0) { 5114 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 5115 int hive_ras_recovery = 0; 5116 5117 if (hive) { 5118 hive_ras_recovery = atomic_read(&hive->ras_recovery); 5119 amdgpu_put_xgmi_hive(hive); 5120 } 5121 /* In the case of multiple GPUs, after a GPU has started 5122 * resetting all GPUs on hive, other GPUs do not need to 5123 * trigger GPU reset again. 
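 * A non-zero hive_ras_recovery means another node has already scheduled the
 * hive-wide recovery work, so this node simply releases its in_recovery claim.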
5124 */ 5125 if (!hive_ras_recovery) 5126 amdgpu_reset_domain_schedule(ras->adev->reset_domain, &ras->recovery_work); 5127 else 5128 atomic_set(&ras->in_recovery, 0); 5129 } else { 5130 flush_work(&ras->recovery_work); 5131 amdgpu_reset_domain_schedule(ras->adev->reset_domain, &ras->recovery_work); 5132 } 5133 5134 return 0; 5135 } 5136 5137 int amdgpu_ras_set_mca_debug_mode(struct amdgpu_device *adev, bool enable) 5138 { 5139 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 5140 int ret = 0; 5141 5142 if (con) { 5143 ret = amdgpu_mca_smu_set_debug_mode(adev, enable); 5144 if (!ret) 5145 con->is_aca_debug_mode = enable; 5146 } 5147 5148 return ret; 5149 } 5150 5151 int amdgpu_ras_set_aca_debug_mode(struct amdgpu_device *adev, bool enable) 5152 { 5153 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 5154 int ret = 0; 5155 5156 if (con) { 5157 if (amdgpu_aca_is_enabled(adev)) 5158 ret = amdgpu_aca_smu_set_debug_mode(adev, enable); 5159 else 5160 ret = amdgpu_mca_smu_set_debug_mode(adev, enable); 5161 if (!ret) 5162 con->is_aca_debug_mode = enable; 5163 } 5164 5165 return ret; 5166 } 5167 5168 bool amdgpu_ras_get_aca_debug_mode(struct amdgpu_device *adev) 5169 { 5170 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 5171 const struct aca_smu_funcs *smu_funcs = adev->aca.smu_funcs; 5172 const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs; 5173 5174 if (!con) 5175 return false; 5176 5177 if ((amdgpu_aca_is_enabled(adev) && smu_funcs && smu_funcs->set_debug_mode) || 5178 (!amdgpu_aca_is_enabled(adev) && mca_funcs && mca_funcs->mca_set_debug_mode)) 5179 return con->is_aca_debug_mode; 5180 else 5181 return true; 5182 } 5183 5184 bool amdgpu_ras_get_error_query_mode(struct amdgpu_device *adev, 5185 unsigned int *error_query_mode) 5186 { 5187 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 5188 const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs; 5189 const struct aca_smu_funcs *smu_funcs = adev->aca.smu_funcs; 5190 5191 if (!con) { 5192 *error_query_mode = AMDGPU_RAS_INVALID_ERROR_QUERY; 5193 return false; 5194 } 5195 5196 if (amdgpu_sriov_vf(adev)) { 5197 *error_query_mode = AMDGPU_RAS_VIRT_ERROR_COUNT_QUERY; 5198 } else if ((smu_funcs && smu_funcs->set_debug_mode) || (mca_funcs && mca_funcs->mca_set_debug_mode)) { 5199 *error_query_mode = 5200 (con->is_aca_debug_mode) ? 
AMDGPU_RAS_DIRECT_ERROR_QUERY : AMDGPU_RAS_FIRMWARE_ERROR_QUERY; 5201 } else { 5202 *error_query_mode = AMDGPU_RAS_DIRECT_ERROR_QUERY; 5203 } 5204
5205 return true; 5206 } 5207
5208 /* Register each ip ras block into amdgpu ras */ 5209 int amdgpu_ras_register_ras_block(struct amdgpu_device *adev, 5210 struct amdgpu_ras_block_object *ras_block_obj) 5211 { 5212 struct amdgpu_ras_block_list *ras_node; 5213 if (!adev || !ras_block_obj) 5214 return -EINVAL; 5215
5216 ras_node = kzalloc_obj(*ras_node); 5217 if (!ras_node) 5218 return -ENOMEM; 5219
5220 INIT_LIST_HEAD(&ras_node->node); 5221 ras_node->ras_obj = ras_block_obj; 5222 list_add_tail(&ras_node->node, &adev->ras_list); 5223
5224 return 0; 5225 } 5226
5227 void amdgpu_ras_get_error_type_name(uint32_t err_type, char *err_type_name) 5228 { 5229 if (!err_type_name) 5230 return; 5231
5232 switch (err_type) { 5233 case AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE: 5234 sprintf(err_type_name, "correctable"); 5235 break; 5236 case AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE: 5237 sprintf(err_type_name, "uncorrectable"); 5238 break; 5239 default: 5240 sprintf(err_type_name, "unknown"); 5241 break; 5242 } 5243 } 5244
5245 bool amdgpu_ras_inst_get_memory_id_field(struct amdgpu_device *adev, 5246 const struct amdgpu_ras_err_status_reg_entry *reg_entry, 5247 uint32_t instance, 5248 uint32_t *memory_id) 5249 { 5250 uint32_t err_status_lo_data, err_status_lo_offset; 5251
5252 if (!reg_entry) 5253 return false; 5254
5255 err_status_lo_offset = 5256 AMDGPU_RAS_REG_ENTRY_OFFSET(reg_entry->hwip, instance, 5257 reg_entry->seg_lo, reg_entry->reg_lo); 5258 err_status_lo_data = RREG32(err_status_lo_offset); 5259
5260 if ((reg_entry->flags & AMDGPU_RAS_ERR_STATUS_VALID) && 5261 !REG_GET_FIELD(err_status_lo_data, ERR_STATUS_LO, ERR_STATUS_VALID_FLAG)) 5262 return false; 5263
5264 *memory_id = REG_GET_FIELD(err_status_lo_data, ERR_STATUS_LO, MEMORY_ID); 5265
5266 return true; 5267 } 5268
5269 bool amdgpu_ras_inst_get_err_cnt_field(struct amdgpu_device *adev, 5270 const struct amdgpu_ras_err_status_reg_entry *reg_entry, 5271 uint32_t instance, 5272 unsigned long *err_cnt) 5273 { 5274 uint32_t err_status_hi_data, err_status_hi_offset; 5275
5276 if (!reg_entry) 5277 return false; 5278
5279 err_status_hi_offset = 5280 AMDGPU_RAS_REG_ENTRY_OFFSET(reg_entry->hwip, instance, 5281 reg_entry->seg_hi, reg_entry->reg_hi); 5282 err_status_hi_data = RREG32(err_status_hi_offset); 5283
5284 if ((reg_entry->flags & AMDGPU_RAS_ERR_INFO_VALID) && 5285 !REG_GET_FIELD(err_status_hi_data, ERR_STATUS_HI, ERR_INFO_VALID_FLAG)) 5286 /* keep the check here in case we need to refer to the result later */ 5287 dev_dbg(adev->dev, "Invalid err_info field\n"); 5288
5289 /* read err count */ 5290 *err_cnt = REG_GET_FIELD(err_status_hi_data, ERR_STATUS, ERR_CNT); 5291
5292 return true; 5293 } 5294
5295 void amdgpu_ras_inst_query_ras_error_count(struct amdgpu_device *adev, 5296 const struct amdgpu_ras_err_status_reg_entry *reg_list, 5297 uint32_t reg_list_size, 5298 const struct amdgpu_ras_memory_id_entry *mem_list, 5299 uint32_t mem_list_size, 5300 uint32_t instance, 5301 uint32_t err_type, 5302 unsigned long *err_count) 5303 { 5304 uint32_t memory_id; 5305 unsigned long err_cnt; 5306 char err_type_name[16]; 5307 uint32_t i, j; 5308
5309 for (i = 0; i < reg_list_size; i++) { 5310 /* query memory_id from err_status_lo */ 5311 if (!amdgpu_ras_inst_get_memory_id_field(adev, &reg_list[i], 5312 instance, &memory_id)) 5313 continue; 5314
5315 /* query err_cnt from err_status_hi */ 5316 if
(!amdgpu_ras_inst_get_err_cnt_field(adev, &reg_list[i], 5317 instance, &err_cnt) || 5318 !err_cnt) 5319 continue; 5320
5321 *err_count += err_cnt; 5322
5323 /* log the errors */ 5324 amdgpu_ras_get_error_type_name(err_type, err_type_name); 5325 if (!mem_list) { 5326 /* memory_list is not supported */ 5327 dev_info(adev->dev, 5328 "%ld %s hardware errors detected in %s, instance: %d, memory_id: %d\n", 5329 err_cnt, err_type_name, 5330 reg_list[i].block_name, 5331 instance, memory_id); 5332 } else { 5333 for (j = 0; j < mem_list_size; j++) { 5334 if (memory_id == mem_list[j].memory_id) { 5335 dev_info(adev->dev, 5336 "%ld %s hardware errors detected in %s, instance: %d, memory block: %s\n", 5337 err_cnt, err_type_name, 5338 reg_list[i].block_name, 5339 instance, mem_list[j].name); 5340 break; 5341 } 5342 } 5343 } 5344 } 5345 } 5346
5347 void amdgpu_ras_inst_reset_ras_error_count(struct amdgpu_device *adev, 5348 const struct amdgpu_ras_err_status_reg_entry *reg_list, 5349 uint32_t reg_list_size, 5350 uint32_t instance) 5351 { 5352 uint32_t err_status_lo_offset, err_status_hi_offset; 5353 uint32_t i; 5354
5355 for (i = 0; i < reg_list_size; i++) { 5356 err_status_lo_offset = 5357 AMDGPU_RAS_REG_ENTRY_OFFSET(reg_list[i].hwip, instance, 5358 reg_list[i].seg_lo, reg_list[i].reg_lo); 5359 err_status_hi_offset = 5360 AMDGPU_RAS_REG_ENTRY_OFFSET(reg_list[i].hwip, instance, 5361 reg_list[i].seg_hi, reg_list[i].reg_hi); 5362 WREG32(err_status_lo_offset, 0); 5363 WREG32(err_status_hi_offset, 0); 5364 } 5365 } 5366
5367 int amdgpu_ras_error_data_init(struct ras_err_data *err_data) 5368 { 5369 memset(err_data, 0, sizeof(*err_data)); 5370
5371 INIT_LIST_HEAD(&err_data->err_node_list); 5372
5373 return 0; 5374 } 5375
5376 static void amdgpu_ras_error_node_release(struct ras_err_node *err_node) 5377 { 5378 if (!err_node) 5379 return; 5380
5381 list_del(&err_node->node); 5382 kvfree(err_node); 5383 } 5384
5385 void amdgpu_ras_error_data_fini(struct ras_err_data *err_data) 5386 { 5387 struct ras_err_node *err_node, *tmp; 5388
5389 list_for_each_entry_safe(err_node, tmp, &err_data->err_node_list, node) 5390 amdgpu_ras_error_node_release(err_node); 5391 } 5392
5393 static struct ras_err_node *amdgpu_ras_error_find_node_by_id(struct ras_err_data *err_data, 5394 struct amdgpu_smuio_mcm_config_info *mcm_info) 5395 { 5396 struct ras_err_node *err_node; 5397 struct amdgpu_smuio_mcm_config_info *ref_id; 5398
5399 if (!err_data || !mcm_info) 5400 return NULL; 5401
5402 for_each_ras_error(err_node, err_data) { 5403 ref_id = &err_node->err_info.mcm_info; 5404
5405 if (mcm_info->socket_id == ref_id->socket_id && 5406 mcm_info->die_id == ref_id->die_id) 5407 return err_node; 5408 } 5409
5410 return NULL; 5411 } 5412
5413 static struct ras_err_node *amdgpu_ras_error_node_new(void) 5414 { 5415 struct ras_err_node *err_node; 5416
5417 err_node = kvzalloc_obj(*err_node); 5418 if (!err_node) 5419 return NULL; 5420
5421 INIT_LIST_HEAD(&err_node->node); 5422
5423 return err_node; 5424 } 5425
5426 static int ras_err_info_cmp(void *priv, const struct list_head *a, const struct list_head *b) 5427 { 5428 struct ras_err_node *nodea = container_of(a, struct ras_err_node, node); 5429 struct ras_err_node *nodeb = container_of(b, struct ras_err_node, node); 5430 struct amdgpu_smuio_mcm_config_info *infoa = &nodea->err_info.mcm_info; 5431 struct amdgpu_smuio_mcm_config_info *infob = &nodeb->err_info.mcm_info; 5432
5433 if (unlikely(infoa->socket_id != infob->socket_id)) 5434 return infoa->socket_id - infob->socket_id; 5435 else 5436 return
infoa->die_id - infob->die_id; 5437 5438 return 0; 5439 } 5440 5441 static struct ras_err_info *amdgpu_ras_error_get_info(struct ras_err_data *err_data, 5442 struct amdgpu_smuio_mcm_config_info *mcm_info) 5443 { 5444 struct ras_err_node *err_node; 5445 5446 err_node = amdgpu_ras_error_find_node_by_id(err_data, mcm_info); 5447 if (err_node) 5448 return &err_node->err_info; 5449 5450 err_node = amdgpu_ras_error_node_new(); 5451 if (!err_node) 5452 return NULL; 5453 5454 memcpy(&err_node->err_info.mcm_info, mcm_info, sizeof(*mcm_info)); 5455 5456 err_data->err_list_count++; 5457 list_add_tail(&err_node->node, &err_data->err_node_list); 5458 list_sort(NULL, &err_data->err_node_list, ras_err_info_cmp); 5459 5460 return &err_node->err_info; 5461 } 5462 5463 int amdgpu_ras_error_statistic_ue_count(struct ras_err_data *err_data, 5464 struct amdgpu_smuio_mcm_config_info *mcm_info, 5465 u64 count) 5466 { 5467 struct ras_err_info *err_info; 5468 5469 if (!err_data || !mcm_info) 5470 return -EINVAL; 5471 5472 if (!count) 5473 return 0; 5474 5475 err_info = amdgpu_ras_error_get_info(err_data, mcm_info); 5476 if (!err_info) 5477 return -EINVAL; 5478 5479 err_info->ue_count += count; 5480 err_data->ue_count += count; 5481 5482 return 0; 5483 } 5484 5485 int amdgpu_ras_error_statistic_ce_count(struct ras_err_data *err_data, 5486 struct amdgpu_smuio_mcm_config_info *mcm_info, 5487 u64 count) 5488 { 5489 struct ras_err_info *err_info; 5490 5491 if (!err_data || !mcm_info) 5492 return -EINVAL; 5493 5494 if (!count) 5495 return 0; 5496 5497 err_info = amdgpu_ras_error_get_info(err_data, mcm_info); 5498 if (!err_info) 5499 return -EINVAL; 5500 5501 err_info->ce_count += count; 5502 err_data->ce_count += count; 5503 5504 return 0; 5505 } 5506 5507 int amdgpu_ras_error_statistic_de_count(struct ras_err_data *err_data, 5508 struct amdgpu_smuio_mcm_config_info *mcm_info, 5509 u64 count) 5510 { 5511 struct ras_err_info *err_info; 5512 5513 if (!err_data || !mcm_info) 5514 return -EINVAL; 5515 5516 if (!count) 5517 return 0; 5518 5519 err_info = amdgpu_ras_error_get_info(err_data, mcm_info); 5520 if (!err_info) 5521 return -EINVAL; 5522 5523 err_info->de_count += count; 5524 err_data->de_count += count; 5525 5526 return 0; 5527 } 5528 5529 #define mmMP0_SMN_C2PMSG_92 0x1609C 5530 #define mmMP0_SMN_C2PMSG_126 0x160BE 5531 static void amdgpu_ras_boot_time_error_reporting(struct amdgpu_device *adev, 5532 u32 instance) 5533 { 5534 u32 socket_id, aid_id, hbm_id; 5535 u32 fw_status; 5536 u32 boot_error; 5537 u64 reg_addr; 5538 5539 /* The pattern for smn addressing in other SOC could be different from 5540 * the one for aqua_vanjaram. We should revisit the code if the pattern 5541 * is changed. In such case, replace the aqua_vanjaram implementation 5542 * with more common helper */ 5543 reg_addr = (mmMP0_SMN_C2PMSG_92 << 2) + 5544 amdgpu_reg_get_smn_base64(adev, MP0_HWIP, instance); 5545 fw_status = amdgpu_device_indirect_rreg_ext(adev, reg_addr); 5546 5547 reg_addr = (mmMP0_SMN_C2PMSG_126 << 2) + 5548 amdgpu_reg_get_smn_base64(adev, MP0_HWIP, instance); 5549 boot_error = amdgpu_device_indirect_rreg_ext(adev, reg_addr); 5550 5551 socket_id = AMDGPU_RAS_GPU_ERR_SOCKET_ID(boot_error); 5552 aid_id = AMDGPU_RAS_GPU_ERR_AID_ID(boot_error); 5553 hbm_id = ((1 == AMDGPU_RAS_GPU_ERR_HBM_ID(boot_error)) ? 
0 : 1); 5554 5555 if (AMDGPU_RAS_GPU_ERR_MEM_TRAINING(boot_error)) 5556 dev_info(adev->dev, 5557 "socket: %d, aid: %d, hbm: %d, fw_status: 0x%x, memory training failed\n", 5558 socket_id, aid_id, hbm_id, fw_status); 5559 5560 if (AMDGPU_RAS_GPU_ERR_FW_LOAD(boot_error)) 5561 dev_info(adev->dev, 5562 "socket: %d, aid: %d, fw_status: 0x%x, firmware load failed at boot time\n", 5563 socket_id, aid_id, fw_status); 5564 5565 if (AMDGPU_RAS_GPU_ERR_WAFL_LINK_TRAINING(boot_error)) 5566 dev_info(adev->dev, 5567 "socket: %d, aid: %d, fw_status: 0x%x, wafl link training failed\n", 5568 socket_id, aid_id, fw_status); 5569 5570 if (AMDGPU_RAS_GPU_ERR_XGMI_LINK_TRAINING(boot_error)) 5571 dev_info(adev->dev, 5572 "socket: %d, aid: %d, fw_status: 0x%x, xgmi link training failed\n", 5573 socket_id, aid_id, fw_status); 5574 5575 if (AMDGPU_RAS_GPU_ERR_USR_CP_LINK_TRAINING(boot_error)) 5576 dev_info(adev->dev, 5577 "socket: %d, aid: %d, fw_status: 0x%x, usr cp link training failed\n", 5578 socket_id, aid_id, fw_status); 5579 5580 if (AMDGPU_RAS_GPU_ERR_USR_DP_LINK_TRAINING(boot_error)) 5581 dev_info(adev->dev, 5582 "socket: %d, aid: %d, fw_status: 0x%x, usr dp link training failed\n", 5583 socket_id, aid_id, fw_status); 5584 5585 if (AMDGPU_RAS_GPU_ERR_HBM_MEM_TEST(boot_error)) 5586 dev_info(adev->dev, 5587 "socket: %d, aid: %d, hbm: %d, fw_status: 0x%x, hbm memory test failed\n", 5588 socket_id, aid_id, hbm_id, fw_status); 5589 5590 if (AMDGPU_RAS_GPU_ERR_HBM_BIST_TEST(boot_error)) 5591 dev_info(adev->dev, 5592 "socket: %d, aid: %d, hbm: %d, fw_status: 0x%x, hbm bist test failed\n", 5593 socket_id, aid_id, hbm_id, fw_status); 5594 5595 if (AMDGPU_RAS_GPU_ERR_DATA_ABORT(boot_error)) 5596 dev_info(adev->dev, 5597 "socket: %d, aid: %d, fw_status: 0x%x, data abort exception\n", 5598 socket_id, aid_id, fw_status); 5599 5600 if (AMDGPU_RAS_GPU_ERR_GENERIC(boot_error)) 5601 dev_info(adev->dev, 5602 "socket: %d, aid: %d, fw_status: 0x%x, Boot Controller Generic Error\n", 5603 socket_id, aid_id, fw_status); 5604 } 5605 5606 static bool amdgpu_ras_boot_error_detected(struct amdgpu_device *adev, 5607 u32 instance) 5608 { 5609 u64 reg_addr; 5610 u32 reg_data; 5611 int retry_loop; 5612 5613 reg_addr = (mmMP0_SMN_C2PMSG_92 << 2) + 5614 amdgpu_reg_get_smn_base64(adev, MP0_HWIP, instance); 5615 5616 for (retry_loop = 0; retry_loop < AMDGPU_RAS_BOOT_STATUS_POLLING_LIMIT; retry_loop++) { 5617 reg_data = amdgpu_device_indirect_rreg_ext(adev, reg_addr); 5618 if ((reg_data & AMDGPU_RAS_BOOT_STATUS_MASK) == AMDGPU_RAS_BOOT_STEADY_STATUS) 5619 return false; 5620 else 5621 msleep(1); 5622 } 5623 5624 return true; 5625 } 5626 5627 void amdgpu_ras_query_boot_status(struct amdgpu_device *adev, u32 num_instances) 5628 { 5629 u32 i; 5630 5631 for (i = 0; i < num_instances; i++) { 5632 if (amdgpu_ras_boot_error_detected(adev, i)) 5633 amdgpu_ras_boot_time_error_reporting(adev, i); 5634 } 5635 } 5636 5637 int amdgpu_ras_reserve_page(struct amdgpu_device *adev, uint64_t pfn) 5638 { 5639 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 5640 struct amdgpu_vram_mgr *mgr = &adev->mman.vram_mgr; 5641 uint64_t start = pfn << AMDGPU_GPU_PAGE_SHIFT; 5642 int ret = 0; 5643 5644 if (amdgpu_ras_check_critical_address(adev, start)) 5645 return 0; 5646 5647 mutex_lock(&con->page_rsv_lock); 5648 ret = amdgpu_vram_mgr_query_page_status(mgr, start); 5649 if (ret == -ENOENT) 5650 ret = amdgpu_vram_mgr_reserve_range(mgr, start, AMDGPU_GPU_PAGE_SIZE); 5651 mutex_unlock(&con->page_rsv_lock); 5652 5653 return ret; 5654 } 5655 5656 void 
amdgpu_ras_event_log_print(struct amdgpu_device *adev, u64 event_id, 5657 const char *fmt, ...) 5658 { 5659 struct va_format vaf; 5660 va_list args; 5661
5662 va_start(args, fmt); 5663 vaf.fmt = fmt; 5664 vaf.va = &args; 5665
5666 if (RAS_EVENT_ID_IS_VALID(event_id)) 5667 dev_printk(KERN_INFO, adev->dev, "{%llu}%pV", event_id, &vaf); 5668 else 5669 dev_printk(KERN_INFO, adev->dev, "%pV", &vaf); 5670
5671 va_end(args); 5672 } 5673
5674 bool amdgpu_ras_is_rma(struct amdgpu_device *adev) 5675 { 5676 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 5677
5678 if (amdgpu_uniras_enabled(adev)) 5679 return amdgpu_ras_mgr_is_rma(adev); 5680
5681 if (!con) 5682 return false; 5683
5684 return con->is_rma; 5685 } 5686
5687 int amdgpu_ras_add_critical_region(struct amdgpu_device *adev, 5688 struct amdgpu_bo *bo) 5689 { 5690 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 5691 struct amdgpu_vram_mgr_resource *vres; 5692 struct ras_critical_region *region; 5693 struct gpu_buddy_block *block; 5694 int ret = 0; 5695
5696 if (!bo || !bo->tbo.resource) 5697 return -EINVAL; 5698
5699 vres = to_amdgpu_vram_mgr_resource(bo->tbo.resource); 5700
5701 mutex_lock(&con->critical_region_lock); 5702
5703 /* Check if the bo had been recorded */ 5704 list_for_each_entry(region, &con->critical_region_head, node) 5705 if (region->bo == bo) 5706 goto out; 5707
5708 /* Record new critical amdgpu bo */ 5709 list_for_each_entry(block, &vres->blocks, link) { 5710 region = kzalloc_obj(*region); 5711 if (!region) { 5712 ret = -ENOMEM; 5713 goto out; 5714 } 5715 region->bo = bo; 5716 region->start = amdgpu_vram_mgr_block_start(block); 5717 region->size = amdgpu_vram_mgr_block_size(block); 5718 list_add_tail(&region->node, &con->critical_region_head); 5719 } 5720
5721 out: 5722 mutex_unlock(&con->critical_region_lock); 5723
5724 return ret; 5725 } 5726
5727 static void amdgpu_ras_critical_region_init(struct amdgpu_device *adev) 5728 { 5729 amdgpu_ras_add_critical_region(adev, adev->mman.fw_reserved_memory); 5730 } 5731
5732 static void amdgpu_ras_critical_region_fini(struct amdgpu_device *adev) 5733 { 5734 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 5735 struct ras_critical_region *region, *tmp; 5736
5737 mutex_lock(&con->critical_region_lock); 5738 list_for_each_entry_safe(region, tmp, &con->critical_region_head, node) { 5739 list_del(&region->node); 5740 kfree(region); 5741 } 5742 mutex_unlock(&con->critical_region_lock); 5743 } 5744
5745 bool amdgpu_ras_check_critical_address(struct amdgpu_device *adev, uint64_t addr) 5746 { 5747 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 5748 struct ras_critical_region *region; 5749 bool ret = false; 5750
5751 mutex_lock(&con->critical_region_lock); 5752 list_for_each_entry(region, &con->critical_region_head, node) { 5753 if ((region->start <= addr) && 5754 (addr < (region->start + region->size))) { 5755 ret = true; 5756 break; 5757 } 5758 } 5759 mutex_unlock(&con->critical_region_lock); 5760
5761 return ret; 5762 } 5763
5764 void amdgpu_ras_pre_reset(struct amdgpu_device *adev, 5765 struct list_head *device_list) 5766 { 5767 struct amdgpu_device *tmp_adev = NULL; 5768
5769 list_for_each_entry(tmp_adev, device_list, reset_list) { 5770 if (amdgpu_uniras_enabled(tmp_adev)) 5771 amdgpu_ras_mgr_pre_reset(tmp_adev); 5772 } 5773 } 5774
5775 void amdgpu_ras_post_reset(struct amdgpu_device *adev, 5776 struct list_head *device_list) 5777 { 5778 struct amdgpu_device *tmp_adev = NULL; 5779
5780 list_for_each_entry(tmp_adev, device_list, reset_list) { 5781 if
(amdgpu_uniras_enabled(tmp_adev)) 5782 amdgpu_ras_mgr_post_reset(tmp_adev); 5783 } 5784 } 5785