1 /* 2 * Copyright 2018 Advanced Micro Devices, Inc. 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice shall be included in 12 * all copies or substantial portions of the Software. 13 * 14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 17 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 18 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 19 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 20 * OTHER DEALINGS IN THE SOFTWARE. 
21 * 22 * 23 */ 24 #include "amdgpu_reg_access.h" 25 #include <linux/debugfs.h> 26 #include <linux/list.h> 27 #include <linux/module.h> 28 #include <linux/uaccess.h> 29 #include <linux/reboot.h> 30 #include <linux/syscalls.h> 31 #include <linux/pm_runtime.h> 32 #include <linux/list_sort.h> 33 34 #include "amdgpu.h" 35 #include "amdgpu_ras.h" 36 #include "amdgpu_atomfirmware.h" 37 #include "amdgpu_xgmi.h" 38 #include "ivsrcid/nbio/irqsrcs_nbif_7_4.h" 39 #include "nbio_v4_3.h" 40 #include "nbif_v6_3_1.h" 41 #include "nbio_v7_9.h" 42 #include "atom.h" 43 #include "amdgpu_reset.h" 44 #include "amdgpu_psp.h" 45 #include "amdgpu_ras_mgr.h" 46 #include "amdgpu_virt_ras_cmd.h" 47 48 #ifdef CONFIG_X86_MCE_AMD 49 #include <asm/mce.h> 50 51 static bool notifier_registered; 52 #endif 53 static const char *RAS_FS_NAME = "ras"; 54 55 const char *ras_error_string[] = { 56 "none", 57 "parity", 58 "single_correctable", 59 "multi_uncorrectable", 60 "poison", 61 }; 62 63 const char *ras_block_string[] = { 64 "umc", 65 "sdma", 66 "gfx", 67 "mmhub", 68 "athub", 69 "pcie_bif", 70 "hdp", 71 "xgmi_wafl", 72 "df", 73 "smn", 74 "sem", 75 "mp0", 76 "mp1", 77 "fuse", 78 "mca", 79 "vcn", 80 "jpeg", 81 "ih", 82 "mpio", 83 "mmsch", 84 }; 85 86 const char *ras_mca_block_string[] = { 87 "mca_mp0", 88 "mca_mp1", 89 "mca_mpio", 90 "mca_iohc", 91 }; 92 93 struct amdgpu_ras_block_list { 94 /* ras block link */ 95 struct list_head node; 96 97 struct amdgpu_ras_block_object *ras_obj; 98 }; 99 100 const char *get_ras_block_str(struct ras_common_if *ras_block) 101 { 102 if (!ras_block) 103 return "NULL"; 104 105 if (ras_block->block >= AMDGPU_RAS_BLOCK_COUNT || 106 ras_block->block >= ARRAY_SIZE(ras_block_string)) 107 return "OUT OF RANGE"; 108 109 if (ras_block->block == AMDGPU_RAS_BLOCK__MCA) 110 return ras_mca_block_string[ras_block->sub_block_index]; 111 112 return ras_block_string[ras_block->block]; 113 } 114 115 #define ras_block_str(_BLOCK_) \ 116 (((_BLOCK_) < ARRAY_SIZE(ras_block_string)) ? 
ras_block_string[_BLOCK_] : "Out Of Range") 117 118 #define ras_err_str(i) (ras_error_string[ffs(i)]) 119 120 #define RAS_DEFAULT_FLAGS (AMDGPU_RAS_FLAG_INIT_BY_VBIOS) 121 122 /* inject address is 52 bits */ 123 #define RAS_UMC_INJECT_ADDR_LIMIT (0x1ULL << 52) 124 125 /* typical ECC bad page rate is 1 bad page per 100MB VRAM */ 126 #define RAS_BAD_PAGE_COVER (100 * 1024 * 1024ULL) 127 128 #define MAX_UMC_POISON_POLLING_TIME_ASYNC 10 129 130 #define AMDGPU_RAS_RETIRE_PAGE_INTERVAL 100 //ms 131 132 #define MAX_FLUSH_RETIRE_DWORK_TIMES 100 133 134 #define BYPASS_ALLOCATED_ADDRESS 0x0 135 #define BYPASS_INITIALIZATION_ADDRESS 0x1 136 137 enum amdgpu_ras_retire_page_reservation { 138 AMDGPU_RAS_RETIRE_PAGE_RESERVED, 139 AMDGPU_RAS_RETIRE_PAGE_PENDING, 140 AMDGPU_RAS_RETIRE_PAGE_FAULT, 141 }; 142 143 atomic_t amdgpu_ras_in_intr = ATOMIC_INIT(0); 144 145 static int amdgpu_ras_check_bad_page_unlock(struct amdgpu_ras *con, 146 uint64_t addr); 147 static int amdgpu_ras_check_bad_page(struct amdgpu_device *adev, 148 uint64_t addr); 149 150 static void amdgpu_ras_critical_region_init(struct amdgpu_device *adev); 151 static void amdgpu_ras_critical_region_fini(struct amdgpu_device *adev); 152 153 #ifdef CONFIG_X86_MCE_AMD 154 static void amdgpu_register_bad_pages_mca_notifier(struct amdgpu_device *adev); 155 static void 156 amdgpu_unregister_bad_pages_mca_notifier(struct amdgpu_device *adev); 157 struct mce_notifier_adev_list { 158 struct amdgpu_device *devs[MAX_GPU_INSTANCE]; 159 int num_gpu; 160 }; 161 static struct mce_notifier_adev_list mce_adev_list; 162 #endif 163 164 void amdgpu_ras_set_error_query_ready(struct amdgpu_device *adev, bool ready) 165 { 166 if (adev && amdgpu_ras_get_context(adev)) 167 amdgpu_ras_get_context(adev)->error_query_ready = ready; 168 } 169 170 static bool amdgpu_ras_get_error_query_ready(struct amdgpu_device *adev) 171 { 172 if (adev && amdgpu_ras_get_context(adev)) 173 return amdgpu_ras_get_context(adev)->error_query_ready; 174 175 return false; 
176 } 177 178 static int amdgpu_reserve_page_direct(struct amdgpu_device *adev, uint64_t address) 179 { 180 struct ras_err_data err_data; 181 struct eeprom_table_record err_rec; 182 int ret; 183 184 ret = amdgpu_ras_check_bad_page(adev, address); 185 if (ret == -EINVAL) { 186 dev_warn(adev->dev, 187 "RAS WARN: input address 0x%llx is invalid.\n", 188 address); 189 return -EINVAL; 190 } else if (ret == 1) { 191 dev_warn(adev->dev, 192 "RAS WARN: 0x%llx has already been marked as bad page!\n", 193 address); 194 return 0; 195 } 196 197 ret = amdgpu_ras_error_data_init(&err_data); 198 if (ret) 199 return ret; 200 201 memset(&err_rec, 0x0, sizeof(struct eeprom_table_record)); 202 err_data.err_addr = &err_rec; 203 amdgpu_umc_fill_error_record(&err_data, address, address, 0, 0); 204 205 if (amdgpu_bad_page_threshold != 0) { 206 amdgpu_ras_add_bad_pages(adev, err_data.err_addr, 207 err_data.err_addr_cnt, false); 208 amdgpu_ras_save_bad_pages(adev, NULL); 209 } 210 211 amdgpu_ras_error_data_fini(&err_data); 212 213 dev_warn(adev->dev, "WARNING: THIS IS ONLY FOR TEST PURPOSES AND WILL CORRUPT RAS EEPROM\n"); 214 dev_warn(adev->dev, "Clear EEPROM:\n"); 215 dev_warn(adev->dev, " echo 1 > /sys/kernel/debug/dri/0/ras/ras_eeprom_reset\n"); 216 217 return 0; 218 } 219 220 static int amdgpu_check_address_validity(struct amdgpu_device *adev, 221 uint64_t address, uint64_t flags) 222 { 223 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 224 struct amdgpu_vram_block_info blk_info; 225 uint64_t page_pfns[32] = {0}; 226 int i, ret, count; 227 bool hit = false; 228 229 if (amdgpu_ip_version(adev, UMC_HWIP, 0) < IP_VERSION(12, 0, 0)) 230 return 0; 231 232 if (amdgpu_sriov_vf(adev)) { 233 if (amdgpu_uniras_enabled(adev)) { 234 if (amdgpu_virt_ras_check_address_validity(adev, address, &hit)) 235 return -EPERM; 236 if (hit) 237 return -EACCES; 238 } else { 239 if (amdgpu_virt_check_vf_critical_region(adev, address, &hit)) 240 return -EPERM; 241 return hit ? 
-EACCES : 0; 242 } 243 } 244 245 if ((address >= adev->gmc.mc_vram_size) || 246 (address >= RAS_UMC_INJECT_ADDR_LIMIT)) 247 return -EFAULT; 248 249 if (amdgpu_uniras_enabled(adev)) { 250 if (amdgpu_sriov_vf(adev)) 251 count = amdgpu_virt_ras_convert_retired_address(adev, address, 252 page_pfns, ARRAY_SIZE(page_pfns)); 253 else 254 count = amdgpu_ras_mgr_lookup_bad_pages_in_a_row(adev, address, 255 page_pfns, ARRAY_SIZE(page_pfns)); 256 } else 257 count = amdgpu_umc_lookup_bad_pages_in_a_row(adev, 258 address, page_pfns, ARRAY_SIZE(page_pfns)); 259 260 if (count <= 0) 261 return -EPERM; 262 263 for (i = 0; i < count; i++) { 264 memset(&blk_info, 0, sizeof(blk_info)); 265 ret = amdgpu_vram_mgr_query_address_block_info(&adev->mman.vram_mgr, 266 page_pfns[i] << AMDGPU_GPU_PAGE_SHIFT, &blk_info); 267 if (!ret) { 268 /* The input address that needs to be checked is allocated by 269 * current calling process, so it is necessary to exclude 270 * the calling process. 271 */ 272 if ((flags == BYPASS_ALLOCATED_ADDRESS) && 273 ((blk_info.task.pid != task_pid_nr(current)) || 274 strncmp(blk_info.task.comm, current->comm, TASK_COMM_LEN))) 275 return -EACCES; 276 else if ((flags == BYPASS_INITIALIZATION_ADDRESS) && 277 (blk_info.task.pid == con->init_task_pid) && 278 !strncmp(blk_info.task.comm, con->init_task_comm, TASK_COMM_LEN)) 279 return -EACCES; 280 } 281 } 282 283 return 0; 284 } 285 286 static ssize_t amdgpu_ras_debugfs_read(struct file *f, char __user *buf, 287 size_t size, loff_t *pos) 288 { 289 struct ras_manager *obj = (struct ras_manager *)file_inode(f)->i_private; 290 struct ras_query_if info = { 291 .head = obj->head, 292 }; 293 ssize_t s; 294 char val[128]; 295 296 if (amdgpu_ras_query_error_status(obj->adev, &info)) 297 return -EINVAL; 298 299 /* Hardware counter will be reset automatically after the query on Vega20 and Arcturus */ 300 if (amdgpu_ip_version(obj->adev, MP0_HWIP, 0) != IP_VERSION(11, 0, 2) && 301 amdgpu_ip_version(obj->adev, MP0_HWIP, 0) != 
IP_VERSION(11, 0, 4)) { 302 if (amdgpu_ras_reset_error_status(obj->adev, info.head.block)) 303 dev_warn(obj->adev->dev, "Failed to reset error counter and error status"); 304 } 305 306 s = snprintf(val, sizeof(val), "%s: %lu\n%s: %lu\n", 307 "ue", info.ue_count, 308 "ce", info.ce_count); 309 if (*pos >= s) 310 return 0; 311 312 s -= *pos; 313 s = min_t(u64, s, size); 314 315 316 if (copy_to_user(buf, &val[*pos], s)) 317 return -EINVAL; 318 319 *pos += s; 320 321 return s; 322 } 323 324 static const struct file_operations amdgpu_ras_debugfs_ops = { 325 .owner = THIS_MODULE, 326 .read = amdgpu_ras_debugfs_read, 327 .write = NULL, 328 .llseek = default_llseek 329 }; 330 331 static int amdgpu_ras_find_block_id_by_name(const char *name, int *block_id) 332 { 333 int i; 334 335 for (i = 0; i < ARRAY_SIZE(ras_block_string); i++) { 336 *block_id = i; 337 if (strcmp(name, ras_block_string[i]) == 0) 338 return 0; 339 } 340 return -EINVAL; 341 } 342 343 static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f, 344 const char __user *buf, size_t size, 345 loff_t *pos, struct ras_debug_if *data) 346 { 347 ssize_t s = min_t(u64, 64, size); 348 char str[65]; 349 char block_name[33]; 350 char err[9] = "ue"; 351 int op = -1; 352 int block_id; 353 uint32_t sub_block; 354 u64 address, value; 355 /* default value is 0 if the mask is not set by user */ 356 u32 instance_mask = 0; 357 358 if (*pos) 359 return -EINVAL; 360 *pos = size; 361 362 memset(str, 0, sizeof(str)); 363 memset(data, 0, sizeof(*data)); 364 365 if (copy_from_user(str, buf, s)) 366 return -EINVAL; 367 368 if (sscanf(str, "disable %32s", block_name) == 1) 369 op = 0; 370 else if (sscanf(str, "enable %32s %8s", block_name, err) == 2) 371 op = 1; 372 else if (sscanf(str, "inject %32s %8s", block_name, err) == 2) 373 op = 2; 374 else if (strstr(str, "retire_page") != NULL) 375 op = 3; 376 else if (strstr(str, "check_address") != NULL) 377 op = 4; 378 else if (str[0] && str[1] && str[2] && str[3]) 379 /* ascii string, 
but commands are not matched. */ 380 return -EINVAL; 381 382 if (op != -1) { 383 if (op == 3) { 384 if (sscanf(str, "%*s 0x%llx", &address) != 1 && 385 sscanf(str, "%*s %llu", &address) != 1) 386 return -EINVAL; 387 388 data->op = op; 389 data->inject.address = address; 390 391 return 0; 392 } else if (op == 4) { 393 if (sscanf(str, "%*s 0x%llx 0x%llx", &address, &value) != 2 && 394 sscanf(str, "%*s %llu %llu", &address, &value) != 2) 395 return -EINVAL; 396 397 data->op = op; 398 data->inject.address = address; 399 data->inject.value = value; 400 return 0; 401 } 402 403 if (amdgpu_ras_find_block_id_by_name(block_name, &block_id)) 404 return -EINVAL; 405 406 data->head.block = block_id; 407 /* only ue, ce and poison errors are supported */ 408 if (!memcmp("ue", err, 2)) 409 data->head.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE; 410 else if (!memcmp("ce", err, 2)) 411 data->head.type = AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE; 412 else if (!memcmp("poison", err, 6)) 413 data->head.type = AMDGPU_RAS_ERROR__POISON; 414 else 415 return -EINVAL; 416 417 data->op = op; 418 419 if (op == 2) { 420 if (sscanf(str, "%*s %*s %*s 0x%x 0x%llx 0x%llx 0x%x", 421 &sub_block, &address, &value, &instance_mask) != 4 && 422 sscanf(str, "%*s %*s %*s %u %llu %llu %u", 423 &sub_block, &address, &value, &instance_mask) != 4 && 424 sscanf(str, "%*s %*s %*s 0x%x 0x%llx 0x%llx", 425 &sub_block, &address, &value) != 3 && 426 sscanf(str, "%*s %*s %*s %u %llu %llu", 427 &sub_block, &address, &value) != 3) 428 return -EINVAL; 429 data->head.sub_block_index = sub_block; 430 data->inject.address = address; 431 data->inject.value = value; 432 data->inject.instance_mask = instance_mask; 433 } 434 } else { 435 if (size < sizeof(*data)) 436 return -EINVAL; 437 438 if (copy_from_user(data, buf, sizeof(*data))) 439 return -EINVAL; 440 } 441 442 return 0; 443 } 444 445 static void amdgpu_ras_instance_mask_check(struct amdgpu_device *adev, 446 struct ras_debug_if *data) 447 { 448 int num_xcc = 
adev->gfx.xcc_mask ? NUM_XCC(adev->gfx.xcc_mask) : 1; 449 uint32_t mask, inst_mask = data->inject.instance_mask; 450 451 /* no need to set instance mask if there is only one instance */ 452 if (num_xcc <= 1 && inst_mask) { 453 data->inject.instance_mask = 0; 454 dev_dbg(adev->dev, 455 "RAS inject mask(0x%x) isn't supported and force it to 0.\n", 456 inst_mask); 457 458 return; 459 } 460 461 switch (data->head.block) { 462 case AMDGPU_RAS_BLOCK__GFX: 463 mask = GENMASK(num_xcc - 1, 0); 464 break; 465 case AMDGPU_RAS_BLOCK__SDMA: 466 mask = GENMASK(adev->sdma.num_instances - 1, 0); 467 break; 468 case AMDGPU_RAS_BLOCK__VCN: 469 case AMDGPU_RAS_BLOCK__JPEG: 470 mask = GENMASK(adev->vcn.num_vcn_inst - 1, 0); 471 break; 472 default: 473 mask = inst_mask; 474 break; 475 } 476 477 /* remove invalid bits in instance mask */ 478 data->inject.instance_mask &= mask; 479 if (inst_mask != data->inject.instance_mask) 480 dev_dbg(adev->dev, 481 "Adjust RAS inject mask 0x%x to 0x%x\n", 482 inst_mask, data->inject.instance_mask); 483 } 484 485 /** 486 * DOC: AMDGPU RAS debugfs control interface 487 * 488 * The control interface accepts struct ras_debug_if which has two members. 489 * 490 * First member: ras_debug_if::head or ras_debug_if::inject. 491 * 492 * head is used to indicate which IP block will be under control. 493 * 494 * head has four members, they are block, type, sub_block_index, name. 495 * block: which IP will be under control. 496 * type: what kind of error will be enabled/disabled/injected. 497 * sub_block_index: some IPs have subcomponets. say, GFX, sDMA. 498 * name: the name of IP. 499 * 500 * inject has three more members than head, they are address, value and mask. 501 * As their names indicate, inject operation will write the 502 * value to the address. 503 * 504 * The second member: struct ras_debug_if::op. 505 * It has three kinds of operations. 506 * 507 * - 0: disable RAS on the block. Take ::head as its data. 508 * - 1: enable RAS on the block. 
Take ::head as its data. 509 * - 2: inject errors on the block. Take ::inject as its data. 510 * 511 * How to use the interface? 512 * 513 * In a program 514 * 515 * Copy the struct ras_debug_if in your code and initialize it. 516 * Write the struct to the control interface. 517 * 518 * From shell 519 * 520 * .. code-block:: bash 521 * 522 * echo "disable <block>" > /sys/kernel/debug/dri/<N>/ras/ras_ctrl 523 * echo "enable <block> <error>" > /sys/kernel/debug/dri/<N>/ras/ras_ctrl 524 * echo "inject <block> <error> <sub-block> <address> <value> <mask>" > /sys/kernel/debug/dri/<N>/ras/ras_ctrl 525 * 526 * Where N, is the card which you want to affect. 527 * 528 * "disable" requires only the block. 529 * "enable" requires the block and error type. 530 * "inject" requires the block, error type, address, and value. 531 * 532 * The block is one of: umc, sdma, gfx, etc. 533 * see ras_block_string[] for details 534 * 535 * The error type is one of: ue, ce and poison where, 536 * ue is multi-uncorrectable 537 * ce is single-correctable 538 * poison is poison 539 * 540 * The sub-block is a the sub-block index, pass 0 if there is no sub-block. 541 * The address and value are hexadecimal numbers, leading 0x is optional. 542 * The mask means instance mask, is optional, default value is 0x1. 543 * 544 * For instance, 545 * 546 * .. code-block:: bash 547 * 548 * echo inject umc ue 0x0 0x0 0x0 > /sys/kernel/debug/dri/0/ras/ras_ctrl 549 * echo inject umc ce 0 0 0 3 > /sys/kernel/debug/dri/0/ras/ras_ctrl 550 * echo disable umc > /sys/kernel/debug/dri/0/ras/ras_ctrl 551 * 552 * How to check the result of the operation? 553 * 554 * To check disable/enable, see "ras" features at, 555 * /sys/class/drm/card[0/1/2...]/device/ras/features 556 * 557 * To check inject, see the corresponding error count at, 558 * /sys/class/drm/card[0/1/2...]/device/ras/[gfx|sdma|umc|...]_err_count 559 * 560 * .. note:: 561 * Operations are only allowed on blocks which are supported. 
562 * Check the "ras" mask at /sys/module/amdgpu/parameters/ras_mask 563 * to see which blocks support RAS on a particular asic. 564 * 565 */ 566 static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, 567 const char __user *buf, 568 size_t size, loff_t *pos) 569 { 570 struct amdgpu_device *adev = (struct amdgpu_device *)file_inode(f)->i_private; 571 struct ras_debug_if data; 572 int ret = 0; 573 574 if (!amdgpu_ras_get_error_query_ready(adev)) { 575 dev_warn(adev->dev, "RAS WARN: error injection " 576 "currently inaccessible\n"); 577 return size; 578 } 579 580 ret = amdgpu_ras_debugfs_ctrl_parse_data(f, buf, size, pos, &data); 581 if (ret) 582 return ret; 583 584 if (data.op == 3) { 585 ret = amdgpu_reserve_page_direct(adev, data.inject.address); 586 if (!ret) 587 return size; 588 else 589 return ret; 590 } else if (data.op == 4) { 591 ret = amdgpu_check_address_validity(adev, data.inject.address, data.inject.value); 592 return ret ? ret : size; 593 } 594 595 if (!amdgpu_ras_is_supported(adev, data.head.block)) 596 return -EINVAL; 597 598 switch (data.op) { 599 case 0: 600 ret = amdgpu_ras_feature_enable(adev, &data.head, 0); 601 break; 602 case 1: 603 ret = amdgpu_ras_feature_enable(adev, &data.head, 1); 604 break; 605 case 2: 606 /* umc ce/ue error injection for a bad page is not allowed */ 607 if (data.head.block == AMDGPU_RAS_BLOCK__UMC) 608 ret = amdgpu_ras_check_bad_page(adev, data.inject.address); 609 if (ret == -EINVAL) { 610 dev_warn(adev->dev, "RAS WARN: input address 0x%llx is invalid.", 611 data.inject.address); 612 break; 613 } else if (ret == 1) { 614 dev_warn(adev->dev, "RAS WARN: inject: 0x%llx has already been marked as bad!\n", 615 data.inject.address); 616 break; 617 } 618 619 amdgpu_ras_instance_mask_check(adev, &data); 620 621 /* data.inject.address is offset instead of absolute gpu address */ 622 ret = amdgpu_ras_error_inject(adev, &data.inject); 623 break; 624 default: 625 ret = -EINVAL; 626 break; 627 } 628 629 if (ret) 630 return ret; 
631 632 return size; 633 } 634 635 static int amdgpu_uniras_clear_badpages_info(struct amdgpu_device *adev); 636 637 /** 638 * DOC: AMDGPU RAS debugfs EEPROM table reset interface 639 * 640 * Some boards contain an EEPROM which is used to persistently store a list of 641 * bad pages which experiences ECC errors in vram. This interface provides 642 * a way to reset the EEPROM, e.g., after testing error injection. 643 * 644 * Usage: 645 * 646 * .. code-block:: bash 647 * 648 * echo 1 > ../ras/ras_eeprom_reset 649 * 650 * will reset EEPROM table to 0 entries. 651 * 652 */ 653 static ssize_t amdgpu_ras_debugfs_eeprom_write(struct file *f, 654 const char __user *buf, 655 size_t size, loff_t *pos) 656 { 657 struct amdgpu_device *adev = 658 (struct amdgpu_device *)file_inode(f)->i_private; 659 int ret; 660 661 if (amdgpu_uniras_enabled(adev)) { 662 ret = amdgpu_uniras_clear_badpages_info(adev); 663 return ret ? ret : size; 664 } 665 666 ret = amdgpu_ras_eeprom_reset_table( 667 &(amdgpu_ras_get_context(adev)->eeprom_control)); 668 669 if (!ret) { 670 /* Something was written to EEPROM. 671 */ 672 amdgpu_ras_get_context(adev)->flags = RAS_DEFAULT_FLAGS; 673 return size; 674 } else { 675 return ret; 676 } 677 } 678 679 static const struct file_operations amdgpu_ras_debugfs_ctrl_ops = { 680 .owner = THIS_MODULE, 681 .read = NULL, 682 .write = amdgpu_ras_debugfs_ctrl_write, 683 .llseek = default_llseek 684 }; 685 686 static const struct file_operations amdgpu_ras_debugfs_eeprom_ops = { 687 .owner = THIS_MODULE, 688 .read = NULL, 689 .write = amdgpu_ras_debugfs_eeprom_write, 690 .llseek = default_llseek 691 }; 692 693 /** 694 * DOC: AMDGPU RAS sysfs Error Count Interface 695 * 696 * It allows the user to read the error count for each IP block on the gpu through 697 * /sys/class/drm/card[0/1/2...]/device/ras/[gfx/sdma/...]_err_count 698 * 699 * It outputs the multiple lines which report the uncorrected (ue) and corrected 700 * (ce) error counts. 
701 * 702 * The format of one line is below, 703 * 704 * [ce|ue]: count 705 * 706 * Example: 707 * 708 * .. code-block:: bash 709 * 710 * ue: 0 711 * ce: 1 712 * 713 */ 714 static ssize_t amdgpu_ras_sysfs_read(struct device *dev, 715 struct device_attribute *attr, char *buf) 716 { 717 struct ras_manager *obj = container_of(attr, struct ras_manager, sysfs_attr); 718 struct ras_query_if info = { 719 .head = obj->head, 720 }; 721 722 if (!amdgpu_ras_get_error_query_ready(obj->adev)) 723 return sysfs_emit(buf, "Query currently inaccessible\n"); 724 725 if (amdgpu_ras_query_error_status(obj->adev, &info)) 726 return -EINVAL; 727 728 if (amdgpu_ip_version(obj->adev, MP0_HWIP, 0) != IP_VERSION(11, 0, 2) && 729 amdgpu_ip_version(obj->adev, MP0_HWIP, 0) != IP_VERSION(11, 0, 4)) { 730 if (amdgpu_ras_reset_error_status(obj->adev, info.head.block)) 731 dev_warn(obj->adev->dev, "Failed to reset error counter and error status"); 732 } 733 734 if (info.head.block == AMDGPU_RAS_BLOCK__UMC) 735 return sysfs_emit(buf, "%s: %lu\n%s: %lu\n%s: %lu\n", "ue", info.ue_count, 736 "ce", info.ce_count, "de", info.de_count); 737 else 738 return sysfs_emit(buf, "%s: %lu\n%s: %lu\n", "ue", info.ue_count, 739 "ce", info.ce_count); 740 } 741 742 /* obj begin */ 743 744 #define get_obj(obj) do { (obj)->use++; } while (0) 745 #define alive_obj(obj) ((obj)->use) 746 747 static inline void put_obj(struct ras_manager *obj) 748 { 749 if (obj && (--obj->use == 0)) { 750 list_del(&obj->node); 751 amdgpu_ras_error_data_fini(&obj->err_data); 752 } 753 754 if (obj && (obj->use < 0)) 755 DRM_ERROR("RAS ERROR: Unbalance obj(%s) use\n", get_ras_block_str(&obj->head)); 756 } 757 758 /* make one obj and return it. 
*/ 759 static struct ras_manager *amdgpu_ras_create_obj(struct amdgpu_device *adev, 760 struct ras_common_if *head) 761 { 762 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 763 struct ras_manager *obj; 764 765 if (!adev->ras_enabled || !con) 766 return NULL; 767 768 if (head->block >= AMDGPU_RAS_BLOCK_COUNT) 769 return NULL; 770 771 if (head->block == AMDGPU_RAS_BLOCK__MCA) { 772 if (head->sub_block_index >= AMDGPU_RAS_MCA_BLOCK__LAST) 773 return NULL; 774 775 obj = &con->objs[AMDGPU_RAS_BLOCK__LAST + head->sub_block_index]; 776 } else 777 obj = &con->objs[head->block]; 778 779 /* already exist. return obj? */ 780 if (alive_obj(obj)) 781 return NULL; 782 783 if (amdgpu_ras_error_data_init(&obj->err_data)) 784 return NULL; 785 786 obj->head = *head; 787 obj->adev = adev; 788 list_add(&obj->node, &con->head); 789 get_obj(obj); 790 791 return obj; 792 } 793 794 /* return an obj equal to head, or the first when head is NULL */ 795 struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev, 796 struct ras_common_if *head) 797 { 798 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 799 struct ras_manager *obj; 800 int i; 801 802 if (!adev->ras_enabled || !con) 803 return NULL; 804 805 if (head) { 806 if (head->block >= AMDGPU_RAS_BLOCK_COUNT) 807 return NULL; 808 809 if (head->block == AMDGPU_RAS_BLOCK__MCA) { 810 if (head->sub_block_index >= AMDGPU_RAS_MCA_BLOCK__LAST) 811 return NULL; 812 813 obj = &con->objs[AMDGPU_RAS_BLOCK__LAST + head->sub_block_index]; 814 } else 815 obj = &con->objs[head->block]; 816 817 if (alive_obj(obj)) 818 return obj; 819 } else { 820 for (i = 0; i < AMDGPU_RAS_BLOCK_COUNT + AMDGPU_RAS_MCA_BLOCK_COUNT; i++) { 821 obj = &con->objs[i]; 822 if (alive_obj(obj)) 823 return obj; 824 } 825 } 826 827 return NULL; 828 } 829 /* obj end */ 830 831 /* feature ctl begin */ 832 static int amdgpu_ras_is_feature_allowed(struct amdgpu_device *adev, 833 struct ras_common_if *head) 834 { 835 return adev->ras_hw_enabled & BIT(head->block); 
836 } 837 838 static int amdgpu_ras_is_feature_enabled(struct amdgpu_device *adev, 839 struct ras_common_if *head) 840 { 841 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 842 843 return con->features & BIT(head->block); 844 } 845 846 /* 847 * if obj is not created, then create one. 848 * set feature enable flag. 849 */ 850 static int __amdgpu_ras_feature_enable(struct amdgpu_device *adev, 851 struct ras_common_if *head, int enable) 852 { 853 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 854 struct ras_manager *obj = amdgpu_ras_find_obj(adev, head); 855 856 /* If hardware does not support ras, then do not create obj. 857 * But if hardware support ras, we can create the obj. 858 * Ras framework checks con->hw_supported to see if it need do 859 * corresponding initialization. 860 * IP checks con->support to see if it need disable ras. 861 */ 862 if (!amdgpu_ras_is_feature_allowed(adev, head)) 863 return 0; 864 865 if (enable) { 866 if (!obj) { 867 obj = amdgpu_ras_create_obj(adev, head); 868 if (!obj) 869 return -EINVAL; 870 } else { 871 /* In case we create obj somewhere else */ 872 get_obj(obj); 873 } 874 con->features |= BIT(head->block); 875 } else { 876 if (obj && amdgpu_ras_is_feature_enabled(adev, head)) { 877 con->features &= ~BIT(head->block); 878 put_obj(obj); 879 } 880 } 881 882 return 0; 883 } 884 885 /* wrapper of psp_ras_enable_features */ 886 int amdgpu_ras_feature_enable(struct amdgpu_device *adev, 887 struct ras_common_if *head, bool enable) 888 { 889 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 890 union ta_ras_cmd_input *info; 891 int ret; 892 893 if (!con) 894 return -EINVAL; 895 896 /* For non-gfx ip, do not enable ras feature if it is not allowed */ 897 /* For gfx ip, regardless of feature support status, */ 898 /* Force issue enable or disable ras feature commands */ 899 if (head->block != AMDGPU_RAS_BLOCK__GFX && 900 !amdgpu_ras_is_feature_allowed(adev, head)) 901 return 0; 902 903 /* Only enable gfx ras feature from 
host side */ 904 if (head->block == AMDGPU_RAS_BLOCK__GFX && 905 !amdgpu_sriov_vf(adev) && 906 !amdgpu_ras_intr_triggered()) { 907 info = kzalloc_obj(union ta_ras_cmd_input); 908 if (!info) 909 return -ENOMEM; 910 911 if (!enable) { 912 info->disable_features = (struct ta_ras_disable_features_input) { 913 .block_id = amdgpu_ras_block_to_ta(head->block), 914 .error_type = amdgpu_ras_error_to_ta(head->type), 915 }; 916 } else { 917 info->enable_features = (struct ta_ras_enable_features_input) { 918 .block_id = amdgpu_ras_block_to_ta(head->block), 919 .error_type = amdgpu_ras_error_to_ta(head->type), 920 }; 921 } 922 923 ret = psp_ras_enable_features(&adev->psp, info, enable); 924 if (ret) { 925 dev_err(adev->dev, "ras %s %s failed poison:%d ret:%d\n", 926 enable ? "enable":"disable", 927 get_ras_block_str(head), 928 amdgpu_ras_is_poison_mode_supported(adev), ret); 929 kfree(info); 930 return ret; 931 } 932 933 kfree(info); 934 } 935 936 /* setup the obj */ 937 __amdgpu_ras_feature_enable(adev, head, enable); 938 939 return 0; 940 } 941 942 /* Only used in device probe stage and called only once. */ 943 int amdgpu_ras_feature_enable_on_boot(struct amdgpu_device *adev, 944 struct ras_common_if *head, bool enable) 945 { 946 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 947 int ret; 948 949 if (!con) 950 return -EINVAL; 951 952 if (con->flags & AMDGPU_RAS_FLAG_INIT_BY_VBIOS) { 953 if (enable) { 954 /* There is no harm to issue a ras TA cmd regardless of 955 * the currecnt ras state. 956 * If current state == target state, it will do nothing 957 * But sometimes it requests driver to reset and repost 958 * with error code -EAGAIN. 959 */ 960 ret = amdgpu_ras_feature_enable(adev, head, 1); 961 /* With old ras TA, we might fail to enable ras. 962 * Log it and just setup the object. 963 * TODO need remove this WA in the future. 
964 */ 965 if (ret == -EINVAL) { 966 ret = __amdgpu_ras_feature_enable(adev, head, 1); 967 if (!ret) 968 dev_info(adev->dev, 969 "RAS INFO: %s setup object\n", 970 get_ras_block_str(head)); 971 } 972 } else { 973 /* setup the object then issue a ras TA disable cmd.*/ 974 ret = __amdgpu_ras_feature_enable(adev, head, 1); 975 if (ret) 976 return ret; 977 978 /* gfx block ras disable cmd must send to ras-ta */ 979 if (head->block == AMDGPU_RAS_BLOCK__GFX) 980 con->features |= BIT(head->block); 981 982 ret = amdgpu_ras_feature_enable(adev, head, 0); 983 984 /* clean gfx block ras features flag */ 985 if (adev->ras_enabled && head->block == AMDGPU_RAS_BLOCK__GFX) 986 con->features &= ~BIT(head->block); 987 } 988 } else 989 ret = amdgpu_ras_feature_enable(adev, head, enable); 990 991 return ret; 992 } 993 994 static int amdgpu_ras_disable_all_features(struct amdgpu_device *adev, 995 bool bypass) 996 { 997 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 998 struct ras_manager *obj, *tmp; 999 1000 list_for_each_entry_safe(obj, tmp, &con->head, node) { 1001 /* bypass psp. 1002 * aka just release the obj and corresponding flags 1003 */ 1004 if (bypass) { 1005 if (__amdgpu_ras_feature_enable(adev, &obj->head, 0)) 1006 break; 1007 } else { 1008 if (amdgpu_ras_feature_enable(adev, &obj->head, 0)) 1009 break; 1010 } 1011 } 1012 1013 return con->features; 1014 } 1015 1016 static int amdgpu_ras_enable_all_features(struct amdgpu_device *adev, 1017 bool bypass) 1018 { 1019 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 1020 int i; 1021 const enum amdgpu_ras_error_type default_ras_type = AMDGPU_RAS_ERROR__NONE; 1022 1023 for (i = 0; i < AMDGPU_RAS_BLOCK_COUNT; i++) { 1024 struct ras_common_if head = { 1025 .block = i, 1026 .type = default_ras_type, 1027 .sub_block_index = 0, 1028 }; 1029 1030 if (i == AMDGPU_RAS_BLOCK__MCA) 1031 continue; 1032 1033 if (bypass) { 1034 /* 1035 * bypass psp. vbios enable ras for us. 
1036 * so just create the obj 1037 */ 1038 if (__amdgpu_ras_feature_enable(adev, &head, 1)) 1039 break; 1040 } else { 1041 if (amdgpu_ras_feature_enable(adev, &head, 1)) 1042 break; 1043 } 1044 } 1045 1046 for (i = 0; i < AMDGPU_RAS_MCA_BLOCK_COUNT; i++) { 1047 struct ras_common_if head = { 1048 .block = AMDGPU_RAS_BLOCK__MCA, 1049 .type = default_ras_type, 1050 .sub_block_index = i, 1051 }; 1052 1053 if (bypass) { 1054 /* 1055 * bypass psp. vbios enable ras for us. 1056 * so just create the obj 1057 */ 1058 if (__amdgpu_ras_feature_enable(adev, &head, 1)) 1059 break; 1060 } else { 1061 if (amdgpu_ras_feature_enable(adev, &head, 1)) 1062 break; 1063 } 1064 } 1065 1066 return con->features; 1067 } 1068 /* feature ctl end */ 1069 1070 static int amdgpu_ras_block_match_default(struct amdgpu_ras_block_object *block_obj, 1071 enum amdgpu_ras_block block) 1072 { 1073 if (!block_obj) 1074 return -EINVAL; 1075 1076 if (block_obj->ras_comm.block == block) 1077 return 0; 1078 1079 return -EINVAL; 1080 } 1081 1082 static struct amdgpu_ras_block_object *amdgpu_ras_get_ras_block(struct amdgpu_device *adev, 1083 enum amdgpu_ras_block block, uint32_t sub_block_index) 1084 { 1085 struct amdgpu_ras_block_list *node, *tmp; 1086 struct amdgpu_ras_block_object *obj; 1087 1088 if (block >= AMDGPU_RAS_BLOCK__LAST) 1089 return NULL; 1090 1091 list_for_each_entry_safe(node, tmp, &adev->ras_list, node) { 1092 if (!node->ras_obj) { 1093 dev_warn(adev->dev, "Warning: abnormal ras list node.\n"); 1094 continue; 1095 } 1096 1097 obj = node->ras_obj; 1098 if (obj->ras_block_match) { 1099 if (obj->ras_block_match(obj, block, sub_block_index) == 0) 1100 return obj; 1101 } else { 1102 if (amdgpu_ras_block_match_default(obj, block) == 0) 1103 return obj; 1104 } 1105 } 1106 1107 return NULL; 1108 } 1109 1110 static void amdgpu_ras_get_ecc_info(struct amdgpu_device *adev, struct ras_err_data *err_data) 1111 { 1112 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 1113 int ret = 0; 1114 1115 /* 
1116 * choosing right query method according to 1117 * whether smu support query error information 1118 */ 1119 ret = amdgpu_dpm_get_ecc_info(adev, (void *)&(ras->umc_ecc)); 1120 if (ret == -EOPNOTSUPP) { 1121 if (adev->umc.ras && adev->umc.ras->ras_block.hw_ops && 1122 adev->umc.ras->ras_block.hw_ops->query_ras_error_count) 1123 adev->umc.ras->ras_block.hw_ops->query_ras_error_count(adev, err_data); 1124 1125 /* umc query_ras_error_address is also responsible for clearing 1126 * error status 1127 */ 1128 if (adev->umc.ras && adev->umc.ras->ras_block.hw_ops && 1129 adev->umc.ras->ras_block.hw_ops->query_ras_error_address) 1130 adev->umc.ras->ras_block.hw_ops->query_ras_error_address(adev, err_data); 1131 } else if (!ret) { 1132 if (adev->umc.ras && 1133 adev->umc.ras->ecc_info_query_ras_error_count) 1134 adev->umc.ras->ecc_info_query_ras_error_count(adev, err_data); 1135 1136 if (adev->umc.ras && 1137 adev->umc.ras->ecc_info_query_ras_error_address) 1138 adev->umc.ras->ecc_info_query_ras_error_address(adev, err_data); 1139 } 1140 } 1141 1142 static void amdgpu_ras_error_print_error_data(struct amdgpu_device *adev, 1143 struct ras_manager *ras_mgr, 1144 struct ras_err_data *err_data, 1145 struct ras_query_context *qctx, 1146 const char *blk_name, 1147 bool is_ue, 1148 bool is_de) 1149 { 1150 struct amdgpu_smuio_mcm_config_info *mcm_info; 1151 struct ras_err_node *err_node; 1152 struct ras_err_info *err_info; 1153 u64 event_id = qctx->evid.event_id; 1154 1155 if (is_ue) { 1156 for_each_ras_error(err_node, err_data) { 1157 err_info = &err_node->err_info; 1158 mcm_info = &err_info->mcm_info; 1159 if (err_info->ue_count) { 1160 RAS_EVENT_LOG(adev, event_id, "socket: %d, die: %d, " 1161 "%lld new uncorrectable hardware errors detected in %s block\n", 1162 mcm_info->socket_id, 1163 mcm_info->die_id, 1164 err_info->ue_count, 1165 blk_name); 1166 } 1167 } 1168 1169 for_each_ras_error(err_node, &ras_mgr->err_data) { 1170 err_info = &err_node->err_info; 1171 mcm_info = 
&err_info->mcm_info; 1172 RAS_EVENT_LOG(adev, event_id, "socket: %d, die: %d, " 1173 "%lld uncorrectable hardware errors detected in total in %s block\n", 1174 mcm_info->socket_id, mcm_info->die_id, err_info->ue_count, blk_name); 1175 } 1176 1177 } else { 1178 if (is_de) { 1179 for_each_ras_error(err_node, err_data) { 1180 err_info = &err_node->err_info; 1181 mcm_info = &err_info->mcm_info; 1182 if (err_info->de_count) { 1183 RAS_EVENT_LOG(adev, event_id, "socket: %d, die: %d, " 1184 "%lld new deferred hardware errors detected in %s block\n", 1185 mcm_info->socket_id, 1186 mcm_info->die_id, 1187 err_info->de_count, 1188 blk_name); 1189 } 1190 } 1191 1192 for_each_ras_error(err_node, &ras_mgr->err_data) { 1193 err_info = &err_node->err_info; 1194 mcm_info = &err_info->mcm_info; 1195 RAS_EVENT_LOG(adev, event_id, "socket: %d, die: %d, " 1196 "%lld deferred hardware errors detected in total in %s block\n", 1197 mcm_info->socket_id, mcm_info->die_id, 1198 err_info->de_count, blk_name); 1199 } 1200 } else { 1201 if (adev->debug_disable_ce_logs) 1202 return; 1203 1204 for_each_ras_error(err_node, err_data) { 1205 err_info = &err_node->err_info; 1206 mcm_info = &err_info->mcm_info; 1207 if (err_info->ce_count) { 1208 RAS_EVENT_LOG(adev, event_id, "socket: %d, die: %d, " 1209 "%lld new correctable hardware errors detected in %s block\n", 1210 mcm_info->socket_id, 1211 mcm_info->die_id, 1212 err_info->ce_count, 1213 blk_name); 1214 } 1215 } 1216 1217 for_each_ras_error(err_node, &ras_mgr->err_data) { 1218 err_info = &err_node->err_info; 1219 mcm_info = &err_info->mcm_info; 1220 RAS_EVENT_LOG(adev, event_id, "socket: %d, die: %d, " 1221 "%lld correctable hardware errors detected in total in %s block\n", 1222 mcm_info->socket_id, mcm_info->die_id, 1223 err_info->ce_count, blk_name); 1224 } 1225 } 1226 } 1227 } 1228 1229 static inline bool err_data_has_source_info(struct ras_err_data *data) 1230 { 1231 return !list_empty(&data->err_node_list); 1232 } 1233 1234 static void 
amdgpu_ras_error_generate_report(struct amdgpu_device *adev, 1235 struct ras_query_if *query_if, 1236 struct ras_err_data *err_data, 1237 struct ras_query_context *qctx) 1238 { 1239 struct ras_manager *ras_mgr = amdgpu_ras_find_obj(adev, &query_if->head); 1240 const char *blk_name = get_ras_block_str(&query_if->head); 1241 u64 event_id = qctx->evid.event_id; 1242 1243 if (err_data->ce_count) { 1244 if (err_data_has_source_info(err_data)) { 1245 amdgpu_ras_error_print_error_data(adev, ras_mgr, err_data, qctx, 1246 blk_name, false, false); 1247 } else if (!adev->aid_mask && 1248 adev->smuio.funcs && 1249 adev->smuio.funcs->get_socket_id && 1250 adev->smuio.funcs->get_die_id) { 1251 RAS_EVENT_LOG(adev, event_id, "socket: %d, die: %d " 1252 "%ld correctable hardware errors " 1253 "detected in %s block\n", 1254 adev->smuio.funcs->get_socket_id(adev), 1255 adev->smuio.funcs->get_die_id(adev), 1256 ras_mgr->err_data.ce_count, 1257 blk_name); 1258 } else { 1259 RAS_EVENT_LOG(adev, event_id, "%ld correctable hardware errors " 1260 "detected in %s block\n", 1261 ras_mgr->err_data.ce_count, 1262 blk_name); 1263 } 1264 } 1265 1266 if (err_data->ue_count) { 1267 if (err_data_has_source_info(err_data)) { 1268 amdgpu_ras_error_print_error_data(adev, ras_mgr, err_data, qctx, 1269 blk_name, true, false); 1270 } else if (!adev->aid_mask && 1271 adev->smuio.funcs && 1272 adev->smuio.funcs->get_socket_id && 1273 adev->smuio.funcs->get_die_id) { 1274 RAS_EVENT_LOG(adev, event_id, "socket: %d, die: %d " 1275 "%ld uncorrectable hardware errors " 1276 "detected in %s block\n", 1277 adev->smuio.funcs->get_socket_id(adev), 1278 adev->smuio.funcs->get_die_id(adev), 1279 ras_mgr->err_data.ue_count, 1280 blk_name); 1281 } else { 1282 RAS_EVENT_LOG(adev, event_id, "%ld uncorrectable hardware errors " 1283 "detected in %s block\n", 1284 ras_mgr->err_data.ue_count, 1285 blk_name); 1286 } 1287 } 1288 1289 if (err_data->de_count) { 1290 if (err_data_has_source_info(err_data)) { 1291 
amdgpu_ras_error_print_error_data(adev, ras_mgr, err_data, qctx, 1292 blk_name, false, true); 1293 } else if (!adev->aid_mask && 1294 adev->smuio.funcs && 1295 adev->smuio.funcs->get_socket_id && 1296 adev->smuio.funcs->get_die_id) { 1297 RAS_EVENT_LOG(adev, event_id, "socket: %d, die: %d " 1298 "%ld deferred hardware errors " 1299 "detected in %s block\n", 1300 adev->smuio.funcs->get_socket_id(adev), 1301 adev->smuio.funcs->get_die_id(adev), 1302 ras_mgr->err_data.de_count, 1303 blk_name); 1304 } else { 1305 RAS_EVENT_LOG(adev, event_id, "%ld deferred hardware errors " 1306 "detected in %s block\n", 1307 ras_mgr->err_data.de_count, 1308 blk_name); 1309 } 1310 } 1311 } 1312 1313 static void amdgpu_ras_virt_error_generate_report(struct amdgpu_device *adev, 1314 struct ras_query_if *query_if, 1315 struct ras_err_data *err_data, 1316 struct ras_query_context *qctx) 1317 { 1318 unsigned long new_ue, new_ce, new_de; 1319 struct ras_manager *obj = amdgpu_ras_find_obj(adev, &query_if->head); 1320 const char *blk_name = get_ras_block_str(&query_if->head); 1321 u64 event_id = qctx->evid.event_id; 1322 1323 new_ce = err_data->ce_count - obj->err_data.ce_count; 1324 new_ue = err_data->ue_count - obj->err_data.ue_count; 1325 new_de = err_data->de_count - obj->err_data.de_count; 1326 1327 if (new_ce) { 1328 RAS_EVENT_LOG(adev, event_id, "%lu correctable hardware errors " 1329 "detected in %s block\n", 1330 new_ce, 1331 blk_name); 1332 } 1333 1334 if (new_ue) { 1335 RAS_EVENT_LOG(adev, event_id, "%lu uncorrectable hardware errors " 1336 "detected in %s block\n", 1337 new_ue, 1338 blk_name); 1339 } 1340 1341 if (new_de) { 1342 RAS_EVENT_LOG(adev, event_id, "%lu deferred hardware errors " 1343 "detected in %s block\n", 1344 new_de, 1345 blk_name); 1346 } 1347 } 1348 1349 static void amdgpu_rasmgr_error_data_statistic_update(struct ras_manager *obj, struct ras_err_data *err_data) 1350 { 1351 struct ras_err_node *err_node; 1352 struct ras_err_info *err_info; 1353 1354 if 
(err_data_has_source_info(err_data)) { 1355 for_each_ras_error(err_node, err_data) { 1356 err_info = &err_node->err_info; 1357 amdgpu_ras_error_statistic_de_count(&obj->err_data, 1358 &err_info->mcm_info, err_info->de_count); 1359 amdgpu_ras_error_statistic_ce_count(&obj->err_data, 1360 &err_info->mcm_info, err_info->ce_count); 1361 amdgpu_ras_error_statistic_ue_count(&obj->err_data, 1362 &err_info->mcm_info, err_info->ue_count); 1363 } 1364 } else { 1365 /* for legacy asic path which doesn't has error source info */ 1366 obj->err_data.ue_count += err_data->ue_count; 1367 obj->err_data.ce_count += err_data->ce_count; 1368 obj->err_data.de_count += err_data->de_count; 1369 } 1370 } 1371 1372 static void amdgpu_ras_mgr_virt_error_data_statistics_update(struct ras_manager *obj, 1373 struct ras_err_data *err_data) 1374 { 1375 /* Host reports absolute counts */ 1376 obj->err_data.ue_count = err_data->ue_count; 1377 obj->err_data.ce_count = err_data->ce_count; 1378 obj->err_data.de_count = err_data->de_count; 1379 } 1380 1381 static struct ras_manager *get_ras_manager(struct amdgpu_device *adev, enum amdgpu_ras_block blk) 1382 { 1383 struct ras_common_if head; 1384 1385 memset(&head, 0, sizeof(head)); 1386 head.block = blk; 1387 1388 return amdgpu_ras_find_obj(adev, &head); 1389 } 1390 1391 int amdgpu_ras_bind_aca(struct amdgpu_device *adev, enum amdgpu_ras_block blk, 1392 const struct aca_info *aca_info, void *data) 1393 { 1394 struct ras_manager *obj; 1395 1396 /* in resume phase, no need to create aca fs node */ 1397 if (adev->in_suspend || amdgpu_reset_in_recovery(adev)) 1398 return 0; 1399 1400 obj = get_ras_manager(adev, blk); 1401 if (!obj) 1402 return -EINVAL; 1403 1404 return amdgpu_aca_add_handle(adev, &obj->aca_handle, ras_block_str(blk), aca_info, data); 1405 } 1406 1407 int amdgpu_ras_unbind_aca(struct amdgpu_device *adev, enum amdgpu_ras_block blk) 1408 { 1409 struct ras_manager *obj; 1410 1411 obj = get_ras_manager(adev, blk); 1412 if (!obj) 1413 return 
-EINVAL; 1414 1415 amdgpu_aca_remove_handle(&obj->aca_handle); 1416 1417 return 0; 1418 } 1419 1420 static int amdgpu_aca_log_ras_error_data(struct amdgpu_device *adev, enum amdgpu_ras_block blk, 1421 enum aca_error_type type, struct ras_err_data *err_data, 1422 struct ras_query_context *qctx) 1423 { 1424 struct ras_manager *obj; 1425 1426 obj = get_ras_manager(adev, blk); 1427 if (!obj) 1428 return -EINVAL; 1429 1430 return amdgpu_aca_get_error_data(adev, &obj->aca_handle, type, err_data, qctx); 1431 } 1432 1433 ssize_t amdgpu_ras_aca_sysfs_read(struct device *dev, struct device_attribute *attr, 1434 struct aca_handle *handle, char *buf, void *data) 1435 { 1436 struct ras_manager *obj = container_of(handle, struct ras_manager, aca_handle); 1437 struct ras_query_if info = { 1438 .head = obj->head, 1439 }; 1440 1441 if (!amdgpu_ras_get_error_query_ready(obj->adev)) 1442 return sysfs_emit(buf, "Query currently inaccessible\n"); 1443 1444 if (amdgpu_ras_query_error_status(obj->adev, &info)) 1445 return -EINVAL; 1446 1447 return sysfs_emit(buf, "%s: %lu\n%s: %lu\n%s: %lu\n", "ue", info.ue_count, 1448 "ce", info.ce_count, "de", info.de_count); 1449 } 1450 1451 static int amdgpu_ras_query_error_status_helper(struct amdgpu_device *adev, 1452 struct ras_query_if *info, 1453 struct ras_err_data *err_data, 1454 struct ras_query_context *qctx, 1455 unsigned int error_query_mode) 1456 { 1457 enum amdgpu_ras_block blk = info ? 
info->head.block : AMDGPU_RAS_BLOCK_COUNT; 1458 struct amdgpu_ras_block_object *block_obj = NULL; 1459 int ret; 1460 1461 if (blk == AMDGPU_RAS_BLOCK_COUNT) 1462 return -EINVAL; 1463 1464 if (error_query_mode == AMDGPU_RAS_INVALID_ERROR_QUERY) 1465 return -EINVAL; 1466 1467 if (error_query_mode == AMDGPU_RAS_VIRT_ERROR_COUNT_QUERY) { 1468 return amdgpu_virt_req_ras_err_count(adev, blk, err_data); 1469 } else if (error_query_mode == AMDGPU_RAS_DIRECT_ERROR_QUERY) { 1470 if (info->head.block == AMDGPU_RAS_BLOCK__UMC) { 1471 amdgpu_ras_get_ecc_info(adev, err_data); 1472 } else { 1473 block_obj = amdgpu_ras_get_ras_block(adev, info->head.block, 0); 1474 if (!block_obj || !block_obj->hw_ops) { 1475 dev_dbg_once(adev->dev, "%s doesn't config RAS function\n", 1476 get_ras_block_str(&info->head)); 1477 return -EINVAL; 1478 } 1479 1480 if (block_obj->hw_ops->query_ras_error_count) 1481 block_obj->hw_ops->query_ras_error_count(adev, err_data); 1482 1483 if ((info->head.block == AMDGPU_RAS_BLOCK__SDMA) || 1484 (info->head.block == AMDGPU_RAS_BLOCK__GFX) || 1485 (info->head.block == AMDGPU_RAS_BLOCK__MMHUB)) { 1486 if (block_obj->hw_ops->query_ras_error_status) 1487 block_obj->hw_ops->query_ras_error_status(adev); 1488 } 1489 } 1490 } else { 1491 if (amdgpu_aca_is_enabled(adev)) { 1492 ret = amdgpu_aca_log_ras_error_data(adev, blk, ACA_ERROR_TYPE_UE, err_data, qctx); 1493 if (ret) 1494 return ret; 1495 1496 ret = amdgpu_aca_log_ras_error_data(adev, blk, ACA_ERROR_TYPE_CE, err_data, qctx); 1497 if (ret) 1498 return ret; 1499 1500 ret = amdgpu_aca_log_ras_error_data(adev, blk, ACA_ERROR_TYPE_DEFERRED, err_data, qctx); 1501 if (ret) 1502 return ret; 1503 } else { 1504 /* FIXME: add code to check return value later */ 1505 amdgpu_mca_smu_log_ras_error(adev, blk, AMDGPU_MCA_ERROR_TYPE_UE, err_data, qctx); 1506 amdgpu_mca_smu_log_ras_error(adev, blk, AMDGPU_MCA_ERROR_TYPE_CE, err_data, qctx); 1507 } 1508 } 1509 1510 return 0; 1511 } 1512 1513 /* query/inject/cure begin */ 1514 
static int amdgpu_ras_query_error_status_with_event(struct amdgpu_device *adev, 1515 struct ras_query_if *info, 1516 enum ras_event_type type) 1517 { 1518 struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head); 1519 struct ras_err_data err_data; 1520 struct ras_query_context qctx; 1521 unsigned int error_query_mode; 1522 int ret; 1523 1524 if (!obj) 1525 return -EINVAL; 1526 1527 ret = amdgpu_ras_error_data_init(&err_data); 1528 if (ret) 1529 return ret; 1530 1531 if (!amdgpu_ras_get_error_query_mode(adev, &error_query_mode)) 1532 return -EINVAL; 1533 1534 memset(&qctx, 0, sizeof(qctx)); 1535 qctx.evid.type = type; 1536 qctx.evid.event_id = amdgpu_ras_acquire_event_id(adev, type); 1537 1538 if (!down_read_trylock(&adev->reset_domain->sem)) { 1539 ret = -EIO; 1540 goto out_fini_err_data; 1541 } 1542 1543 ret = amdgpu_ras_query_error_status_helper(adev, info, 1544 &err_data, 1545 &qctx, 1546 error_query_mode); 1547 up_read(&adev->reset_domain->sem); 1548 if (ret) 1549 goto out_fini_err_data; 1550 1551 if (error_query_mode != AMDGPU_RAS_VIRT_ERROR_COUNT_QUERY) { 1552 amdgpu_rasmgr_error_data_statistic_update(obj, &err_data); 1553 amdgpu_ras_error_generate_report(adev, info, &err_data, &qctx); 1554 } else { 1555 /* Host provides absolute error counts. First generate the report 1556 * using the previous VF internal count against new host count. 1557 * Then Update VF internal count. 
1558 */ 1559 amdgpu_ras_virt_error_generate_report(adev, info, &err_data, &qctx); 1560 amdgpu_ras_mgr_virt_error_data_statistics_update(obj, &err_data); 1561 } 1562 1563 info->ue_count = obj->err_data.ue_count; 1564 info->ce_count = obj->err_data.ce_count; 1565 info->de_count = obj->err_data.de_count; 1566 1567 out_fini_err_data: 1568 amdgpu_ras_error_data_fini(&err_data); 1569 1570 return ret; 1571 } 1572 1573 static int amdgpu_uniras_clear_badpages_info(struct amdgpu_device *adev) 1574 { 1575 struct ras_cmd_dev_handle req = {0}; 1576 int ret; 1577 1578 ret = amdgpu_ras_mgr_handle_ras_cmd(adev, RAS_CMD__CLEAR_BAD_PAGE_INFO, 1579 &req, sizeof(req), NULL, 0); 1580 if (ret) { 1581 dev_err(adev->dev, "Failed to clear bad pages info, ret: %d\n", ret); 1582 return ret; 1583 } 1584 1585 return 0; 1586 } 1587 1588 static int amdgpu_uniras_query_block_ecc(struct amdgpu_device *adev, 1589 struct ras_query_if *info) 1590 { 1591 struct ras_cmd_block_ecc_info_req req = {0}; 1592 struct ras_cmd_block_ecc_info_rsp rsp = {0}; 1593 int ret; 1594 1595 if (!info) 1596 return -EINVAL; 1597 1598 req.block_id = info->head.block; 1599 req.subblock_id = info->head.sub_block_index; 1600 1601 ret = amdgpu_ras_mgr_handle_ras_cmd(adev, RAS_CMD__GET_BLOCK_ECC_STATUS, 1602 &req, sizeof(req), &rsp, sizeof(rsp)); 1603 if (!ret) { 1604 info->ce_count = rsp.ce_count; 1605 info->ue_count = rsp.ue_count; 1606 info->de_count = rsp.de_count; 1607 } 1608 1609 return ret; 1610 } 1611 1612 int amdgpu_ras_query_error_status(struct amdgpu_device *adev, struct ras_query_if *info) 1613 { 1614 if (amdgpu_uniras_enabled(adev)) 1615 return amdgpu_uniras_query_block_ecc(adev, info); 1616 else 1617 return amdgpu_ras_query_error_status_with_event(adev, info, RAS_EVENT_TYPE_INVALID); 1618 } 1619 1620 int amdgpu_ras_reset_error_count(struct amdgpu_device *adev, 1621 enum amdgpu_ras_block block) 1622 { 1623 struct amdgpu_ras_block_object *block_obj = amdgpu_ras_get_ras_block(adev, block, 0); 1624 const struct 
amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs; 1625 const struct aca_smu_funcs *smu_funcs = adev->aca.smu_funcs; 1626 1627 if (!block_obj || !block_obj->hw_ops) { 1628 dev_dbg_once(adev->dev, "%s doesn't config RAS function\n", 1629 ras_block_str(block)); 1630 return -EOPNOTSUPP; 1631 } 1632 1633 if (!amdgpu_ras_is_supported(adev, block) || 1634 !amdgpu_ras_get_aca_debug_mode(adev)) 1635 return -EOPNOTSUPP; 1636 1637 if (amdgpu_sriov_vf(adev)) 1638 return -EOPNOTSUPP; 1639 1640 /* skip ras error reset in gpu reset */ 1641 if ((amdgpu_in_reset(adev) || amdgpu_ras_in_recovery(adev)) && 1642 ((smu_funcs && smu_funcs->set_debug_mode) || 1643 (mca_funcs && mca_funcs->mca_set_debug_mode))) 1644 return -EOPNOTSUPP; 1645 1646 if (block_obj->hw_ops->reset_ras_error_count) 1647 block_obj->hw_ops->reset_ras_error_count(adev); 1648 1649 return 0; 1650 } 1651 1652 int amdgpu_ras_reset_error_status(struct amdgpu_device *adev, 1653 enum amdgpu_ras_block block) 1654 { 1655 struct amdgpu_ras_block_object *block_obj = amdgpu_ras_get_ras_block(adev, block, 0); 1656 1657 if (amdgpu_ras_reset_error_count(adev, block) == -EOPNOTSUPP) 1658 return 0; 1659 1660 if ((block == AMDGPU_RAS_BLOCK__GFX) || 1661 (block == AMDGPU_RAS_BLOCK__MMHUB)) { 1662 if (block_obj->hw_ops->reset_ras_error_status) 1663 block_obj->hw_ops->reset_ras_error_status(adev); 1664 } 1665 1666 return 0; 1667 } 1668 1669 static int amdgpu_uniras_error_inject(struct amdgpu_device *adev, 1670 struct ras_inject_if *info) 1671 { 1672 struct ras_cmd_inject_error_req inject_req; 1673 struct ras_cmd_inject_error_rsp rsp; 1674 1675 if (!info) 1676 return -EINVAL; 1677 1678 memset(&inject_req, 0, sizeof(inject_req)); 1679 inject_req.block_id = info->head.block; 1680 inject_req.subblock_id = info->head.sub_block_index; 1681 inject_req.address = info->address; 1682 inject_req.error_type = info->head.type; 1683 inject_req.instance_mask = info->instance_mask; 1684 inject_req.method = info->value; 1685 1686 return 
amdgpu_ras_mgr_handle_ras_cmd(adev, RAS_CMD__INJECT_ERROR, 1687 &inject_req, sizeof(inject_req), &rsp, sizeof(rsp)); 1688 } 1689 1690 /* wrapper of psp_ras_trigger_error */ 1691 int amdgpu_ras_error_inject(struct amdgpu_device *adev, 1692 struct ras_inject_if *info) 1693 { 1694 struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head); 1695 struct ta_ras_trigger_error_input block_info = { 1696 .block_id = amdgpu_ras_block_to_ta(info->head.block), 1697 .inject_error_type = amdgpu_ras_error_to_ta(info->head.type), 1698 .sub_block_index = info->head.sub_block_index, 1699 .address = info->address, 1700 .value = info->value, 1701 }; 1702 int ret = -EINVAL; 1703 struct amdgpu_ras_block_object *block_obj = amdgpu_ras_get_ras_block(adev, 1704 info->head.block, 1705 info->head.sub_block_index); 1706 1707 if (amdgpu_uniras_enabled(adev)) 1708 return amdgpu_uniras_error_inject(adev, info); 1709 1710 /* inject on guest isn't allowed, return success directly */ 1711 if (amdgpu_sriov_vf(adev)) 1712 return 0; 1713 1714 if (!obj) 1715 return -EINVAL; 1716 1717 if (!block_obj || !block_obj->hw_ops) { 1718 dev_dbg_once(adev->dev, "%s doesn't config RAS function\n", 1719 get_ras_block_str(&info->head)); 1720 return -EINVAL; 1721 } 1722 1723 /* Calculate XGMI relative offset */ 1724 if (adev->gmc.xgmi.num_physical_nodes > 1 && 1725 info->head.block != AMDGPU_RAS_BLOCK__GFX) { 1726 block_info.address = 1727 amdgpu_xgmi_get_relative_phy_addr(adev, 1728 block_info.address); 1729 } 1730 1731 if (block_obj->hw_ops->ras_error_inject) { 1732 if (info->head.block == AMDGPU_RAS_BLOCK__GFX) 1733 ret = block_obj->hw_ops->ras_error_inject(adev, info, info->instance_mask); 1734 else /* Special ras_error_inject is defined (e.g: xgmi) */ 1735 ret = block_obj->hw_ops->ras_error_inject(adev, &block_info, 1736 info->instance_mask); 1737 } else { 1738 /* default path */ 1739 ret = psp_ras_trigger_error(&adev->psp, &block_info, info->instance_mask); 1740 } 1741 1742 if (ret) 1743 
dev_err(adev->dev, "ras inject %s failed %d\n", 1744 get_ras_block_str(&info->head), ret); 1745 1746 return ret; 1747 } 1748 1749 /** 1750 * amdgpu_ras_query_error_count_helper -- Get error counter for specific IP 1751 * @adev: pointer to AMD GPU device 1752 * @ce_count: pointer to an integer to be set to the count of correctible errors. 1753 * @ue_count: pointer to an integer to be set to the count of uncorrectible errors. 1754 * @query_info: pointer to ras_query_if 1755 * 1756 * Return 0 for query success or do nothing, otherwise return an error 1757 * on failures 1758 */ 1759 static int amdgpu_ras_query_error_count_helper(struct amdgpu_device *adev, 1760 unsigned long *ce_count, 1761 unsigned long *ue_count, 1762 struct ras_query_if *query_info) 1763 { 1764 int ret; 1765 1766 if (!query_info) 1767 /* do nothing if query_info is not specified */ 1768 return 0; 1769 1770 ret = amdgpu_ras_query_error_status(adev, query_info); 1771 if (ret) 1772 return ret; 1773 1774 *ce_count += query_info->ce_count; 1775 *ue_count += query_info->ue_count; 1776 1777 /* some hardware/IP supports read to clear 1778 * no need to explictly reset the err status after the query call */ 1779 if (amdgpu_ip_version(adev, MP0_HWIP, 0) != IP_VERSION(11, 0, 2) && 1780 amdgpu_ip_version(adev, MP0_HWIP, 0) != IP_VERSION(11, 0, 4)) { 1781 if (amdgpu_ras_reset_error_status(adev, query_info->head.block)) 1782 dev_warn(adev->dev, 1783 "Failed to reset error counter and error status\n"); 1784 } 1785 1786 return 0; 1787 } 1788 1789 /** 1790 * amdgpu_ras_query_error_count -- Get error counts of all IPs or specific IP 1791 * @adev: pointer to AMD GPU device 1792 * @ce_count: pointer to an integer to be set to the count of correctible errors. 1793 * @ue_count: pointer to an integer to be set to the count of uncorrectible 1794 * errors. 
1795 * @query_info: pointer to ras_query_if if the query request is only for 1796 * specific ip block; if info is NULL, then the qurey request is for 1797 * all the ip blocks that support query ras error counters/status 1798 * 1799 * If set, @ce_count or @ue_count, count and return the corresponding 1800 * error counts in those integer pointers. Return 0 if the device 1801 * supports RAS. Return -EOPNOTSUPP if the device doesn't support RAS. 1802 */ 1803 int amdgpu_ras_query_error_count(struct amdgpu_device *adev, 1804 unsigned long *ce_count, 1805 unsigned long *ue_count, 1806 struct ras_query_if *query_info) 1807 { 1808 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 1809 struct ras_manager *obj; 1810 unsigned long ce, ue; 1811 int ret; 1812 1813 if (!adev->ras_enabled || !con) 1814 return -EOPNOTSUPP; 1815 1816 /* Don't count since no reporting. 1817 */ 1818 if (!ce_count && !ue_count) 1819 return 0; 1820 1821 ce = 0; 1822 ue = 0; 1823 if (!query_info) { 1824 /* query all the ip blocks that support ras query interface */ 1825 list_for_each_entry(obj, &con->head, node) { 1826 struct ras_query_if info = { 1827 .head = obj->head, 1828 }; 1829 1830 ret = amdgpu_ras_query_error_count_helper(adev, &ce, &ue, &info); 1831 } 1832 } else { 1833 /* query specific ip block */ 1834 ret = amdgpu_ras_query_error_count_helper(adev, &ce, &ue, query_info); 1835 } 1836 1837 if (ret) 1838 return ret; 1839 1840 if (ce_count) 1841 *ce_count = ce; 1842 1843 if (ue_count) 1844 *ue_count = ue; 1845 1846 return 0; 1847 } 1848 /* query/inject/cure end */ 1849 1850 1851 /* sysfs begin */ 1852 1853 static int amdgpu_ras_badpages_read(struct amdgpu_device *adev, 1854 struct ras_badpage *bps, uint32_t count, uint32_t start); 1855 static int amdgpu_uniras_badpages_read(struct amdgpu_device *adev, 1856 struct ras_badpage *bps, uint32_t count, uint32_t start); 1857 1858 static char *amdgpu_ras_badpage_flags_str(unsigned int flags) 1859 { 1860 switch (flags) { 1861 case 
AMDGPU_RAS_RETIRE_PAGE_RESERVED: 1862 return "R"; 1863 case AMDGPU_RAS_RETIRE_PAGE_PENDING: 1864 return "P"; 1865 case AMDGPU_RAS_RETIRE_PAGE_FAULT: 1866 default: 1867 return "F"; 1868 } 1869 } 1870 1871 /** 1872 * DOC: AMDGPU RAS sysfs gpu_vram_bad_pages Interface 1873 * 1874 * It allows user to read the bad pages of vram on the gpu through 1875 * /sys/class/drm/card[0/1/2...]/device/ras/gpu_vram_bad_pages 1876 * 1877 * It outputs multiple lines, and each line stands for one gpu page. 1878 * 1879 * The format of one line is below, 1880 * gpu pfn : gpu page size : flags 1881 * 1882 * gpu pfn and gpu page size are printed in hex format. 1883 * flags can be one of below character, 1884 * 1885 * R: reserved, this gpu page is reserved and not able to use. 1886 * 1887 * P: pending for reserve, this gpu page is marked as bad, will be reserved 1888 * in next window of page_reserve. 1889 * 1890 * F: unable to reserve. this gpu page can't be reserved due to some reasons. 1891 * 1892 * Examples: 1893 * 1894 * .. 
code-block:: bash 1895 * 1896 * 0x00000001 : 0x00001000 : R 1897 * 0x00000002 : 0x00001000 : P 1898 * 1899 */ 1900 1901 static ssize_t amdgpu_ras_sysfs_badpages_read(struct file *f, 1902 struct kobject *kobj, const struct bin_attribute *attr, 1903 char *buf, loff_t ppos, size_t count) 1904 { 1905 struct amdgpu_ras *con = 1906 container_of(attr, struct amdgpu_ras, badpages_attr); 1907 struct amdgpu_device *adev = con->adev; 1908 const unsigned int element_size = 1909 sizeof("0xabcdabcd : 0x12345678 : R\n") - 1; 1910 unsigned int start = div64_ul(ppos + element_size - 1, element_size); 1911 unsigned int end = div64_ul(ppos + count - 1, element_size); 1912 ssize_t s = 0; 1913 struct ras_badpage *bps = NULL; 1914 int bps_count = 0, i, status; 1915 uint64_t address; 1916 1917 memset(buf, 0, count); 1918 1919 bps_count = end - start; 1920 bps = kmalloc_objs(*bps, bps_count); 1921 if (!bps) 1922 return 0; 1923 1924 memset(bps, 0, sizeof(*bps) * bps_count); 1925 1926 if (amdgpu_uniras_enabled(adev)) 1927 bps_count = amdgpu_uniras_badpages_read(adev, bps, bps_count, start); 1928 else 1929 bps_count = amdgpu_ras_badpages_read(adev, bps, bps_count, start); 1930 1931 if (bps_count <= 0) { 1932 kfree(bps); 1933 return 0; 1934 } 1935 1936 for (i = 0; i < bps_count; i++) { 1937 address = ((uint64_t)bps[i].bp) << AMDGPU_GPU_PAGE_SHIFT; 1938 1939 bps[i].size = AMDGPU_GPU_PAGE_SIZE; 1940 1941 status = amdgpu_vram_mgr_query_page_status(&adev->mman.vram_mgr, 1942 address); 1943 if (status == -EBUSY) 1944 bps[i].flags = AMDGPU_RAS_RETIRE_PAGE_PENDING; 1945 else if (status == -ENOENT) 1946 bps[i].flags = AMDGPU_RAS_RETIRE_PAGE_FAULT; 1947 else 1948 bps[i].flags = AMDGPU_RAS_RETIRE_PAGE_RESERVED; 1949 1950 if ((bps[i].flags != AMDGPU_RAS_RETIRE_PAGE_RESERVED) && 1951 amdgpu_ras_check_critical_address(adev, address)) 1952 bps[i].flags = AMDGPU_RAS_RETIRE_PAGE_RESERVED; 1953 1954 s += scnprintf(&buf[s], element_size + 1, 1955 "0x%08x : 0x%08x : %1s\n", 1956 bps[i].bp, 1957 bps[i].size, 
1958 amdgpu_ras_badpage_flags_str(bps[i].flags)); 1959 } 1960 1961 kfree(bps); 1962 1963 return s; 1964 } 1965 1966 static ssize_t amdgpu_ras_sysfs_features_read(struct device *dev, 1967 struct device_attribute *attr, char *buf) 1968 { 1969 struct amdgpu_ras *con = 1970 container_of(attr, struct amdgpu_ras, features_attr); 1971 1972 return sysfs_emit(buf, "feature mask: 0x%x\n", con->features); 1973 } 1974 1975 static bool amdgpu_ras_get_version_info(struct amdgpu_device *adev, u32 *major, 1976 u32 *minor, u32 *rev) 1977 { 1978 int i; 1979 1980 if (!adev || !major || !minor || !rev || !amdgpu_uniras_enabled(adev)) 1981 return false; 1982 1983 for (i = 0; i < adev->num_ip_blocks; i++) { 1984 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_RAS) { 1985 *major = adev->ip_blocks[i].version->major; 1986 *minor = adev->ip_blocks[i].version->minor; 1987 *rev = adev->ip_blocks[i].version->rev; 1988 return true; 1989 } 1990 } 1991 1992 return false; 1993 } 1994 1995 static ssize_t amdgpu_ras_sysfs_version_show(struct device *dev, 1996 struct device_attribute *attr, char *buf) 1997 { 1998 struct amdgpu_ras *con = 1999 container_of(attr, struct amdgpu_ras, version_attr); 2000 u32 major, minor, rev; 2001 ssize_t size = 0; 2002 2003 size += sysfs_emit_at(buf, size, "table version: 0x%x\n", 2004 con->eeprom_control.tbl_hdr.version); 2005 2006 if (amdgpu_ras_get_version_info(con->adev, &major, &minor, &rev)) 2007 size += sysfs_emit_at(buf, size, "ras version: %u.%u.%u\n", 2008 major, minor, rev); 2009 2010 return size; 2011 } 2012 2013 static ssize_t amdgpu_ras_sysfs_schema_show(struct device *dev, 2014 struct device_attribute *attr, char *buf) 2015 { 2016 struct amdgpu_ras *con = 2017 container_of(attr, struct amdgpu_ras, schema_attr); 2018 return sysfs_emit(buf, "schema: 0x%x\n", con->schema); 2019 } 2020 2021 static struct { 2022 enum ras_event_type type; 2023 const char *name; 2024 } dump_event[] = { 2025 {RAS_EVENT_TYPE_FATAL, "Fatal Error"}, 2026 
{RAS_EVENT_TYPE_POISON_CREATION, "Poison Creation"}, 2027 {RAS_EVENT_TYPE_POISON_CONSUMPTION, "Poison Consumption"}, 2028 }; 2029 2030 static ssize_t amdgpu_ras_sysfs_event_state_show(struct device *dev, 2031 struct device_attribute *attr, char *buf) 2032 { 2033 struct amdgpu_ras *con = 2034 container_of(attr, struct amdgpu_ras, event_state_attr); 2035 struct ras_event_manager *event_mgr = con->event_mgr; 2036 struct ras_event_state *event_state; 2037 int i, size = 0; 2038 2039 if (!event_mgr) 2040 return -EINVAL; 2041 2042 size += sysfs_emit_at(buf, size, "current seqno: %llu\n", atomic64_read(&event_mgr->seqno)); 2043 for (i = 0; i < ARRAY_SIZE(dump_event); i++) { 2044 event_state = &event_mgr->event_state[dump_event[i].type]; 2045 size += sysfs_emit_at(buf, size, "%s: count:%llu, last_seqno:%llu\n", 2046 dump_event[i].name, 2047 atomic64_read(&event_state->count), 2048 event_state->last_seqno); 2049 } 2050 2051 return (ssize_t)size; 2052 } 2053 2054 static void amdgpu_ras_sysfs_remove_bad_page_node(struct amdgpu_device *adev) 2055 { 2056 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 2057 2058 if (adev->dev->kobj.sd) 2059 sysfs_remove_file_from_group(&adev->dev->kobj, 2060 &con->badpages_attr.attr, 2061 RAS_FS_NAME); 2062 } 2063 2064 static int amdgpu_ras_sysfs_remove_dev_attr_node(struct amdgpu_device *adev) 2065 { 2066 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 2067 struct attribute *attrs[] = { 2068 &con->features_attr.attr, 2069 &con->version_attr.attr, 2070 &con->schema_attr.attr, 2071 &con->event_state_attr.attr, 2072 NULL 2073 }; 2074 struct attribute_group group = { 2075 .name = RAS_FS_NAME, 2076 .attrs = attrs, 2077 }; 2078 2079 if (adev->dev->kobj.sd) 2080 sysfs_remove_group(&adev->dev->kobj, &group); 2081 2082 return 0; 2083 } 2084 2085 int amdgpu_ras_sysfs_create(struct amdgpu_device *adev, 2086 struct ras_common_if *head) 2087 { 2088 struct ras_manager *obj = amdgpu_ras_find_obj(adev, head); 2089 2090 if 
(amdgpu_aca_is_enabled(adev)) 2091 return 0; 2092 2093 if (!obj || obj->attr_inuse) 2094 return -EINVAL; 2095 2096 if (amdgpu_sriov_vf(adev) && !amdgpu_virt_ras_telemetry_block_en(adev, head->block)) 2097 return 0; 2098 2099 get_obj(obj); 2100 2101 snprintf(obj->fs_data.sysfs_name, sizeof(obj->fs_data.sysfs_name), 2102 "%s_err_count", head->name); 2103 2104 obj->sysfs_attr = (struct device_attribute){ 2105 .attr = { 2106 .name = obj->fs_data.sysfs_name, 2107 .mode = S_IRUGO, 2108 }, 2109 .show = amdgpu_ras_sysfs_read, 2110 }; 2111 sysfs_attr_init(&obj->sysfs_attr.attr); 2112 2113 if (sysfs_add_file_to_group(&adev->dev->kobj, 2114 &obj->sysfs_attr.attr, 2115 RAS_FS_NAME)) { 2116 put_obj(obj); 2117 return -EINVAL; 2118 } 2119 2120 obj->attr_inuse = 1; 2121 2122 return 0; 2123 } 2124 2125 int amdgpu_ras_sysfs_remove(struct amdgpu_device *adev, 2126 struct ras_common_if *head) 2127 { 2128 struct ras_manager *obj = amdgpu_ras_find_obj(adev, head); 2129 2130 if (amdgpu_aca_is_enabled(adev)) 2131 return 0; 2132 2133 if (!obj || !obj->attr_inuse) 2134 return -EINVAL; 2135 2136 if (adev->dev->kobj.sd) 2137 sysfs_remove_file_from_group(&adev->dev->kobj, 2138 &obj->sysfs_attr.attr, 2139 RAS_FS_NAME); 2140 obj->attr_inuse = 0; 2141 put_obj(obj); 2142 2143 return 0; 2144 } 2145 2146 static int amdgpu_ras_sysfs_remove_all(struct amdgpu_device *adev) 2147 { 2148 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 2149 struct ras_manager *obj, *tmp; 2150 2151 list_for_each_entry_safe(obj, tmp, &con->head, node) { 2152 amdgpu_ras_sysfs_remove(adev, &obj->head); 2153 } 2154 2155 if (amdgpu_bad_page_threshold != 0) 2156 amdgpu_ras_sysfs_remove_bad_page_node(adev); 2157 2158 amdgpu_ras_sysfs_remove_dev_attr_node(adev); 2159 2160 return 0; 2161 } 2162 /* sysfs end */ 2163 2164 /** 2165 * DOC: AMDGPU RAS Reboot Behavior for Unrecoverable Errors 2166 * 2167 * Normally when there is an uncorrectable error, the driver will reset 2168 * the GPU to recover. 
However, in the event of an unrecoverable error, 2169 * the driver provides an interface to reboot the system automatically 2170 * in that event. 2171 * 2172 * The following file in debugfs provides that interface: 2173 * /sys/kernel/debug/dri/[0/1/2...]/ras/auto_reboot 2174 * 2175 * Usage: 2176 * 2177 * .. code-block:: bash 2178 * 2179 * echo true > .../ras/auto_reboot 2180 * 2181 */ 2182 /* debugfs begin */ 2183 static struct dentry *amdgpu_ras_debugfs_create_ctrl_node(struct amdgpu_device *adev) 2184 { 2185 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 2186 struct amdgpu_ras_eeprom_control *eeprom = &con->eeprom_control; 2187 struct drm_minor *minor = adev_to_drm(adev)->primary; 2188 struct dentry *dir; 2189 2190 dir = debugfs_create_dir(RAS_FS_NAME, minor->debugfs_root); 2191 debugfs_create_file("ras_ctrl", S_IWUGO | S_IRUGO, dir, adev, 2192 &amdgpu_ras_debugfs_ctrl_ops); 2193 debugfs_create_file("ras_eeprom_reset", S_IWUGO | S_IRUGO, dir, adev, 2194 &amdgpu_ras_debugfs_eeprom_ops); 2195 debugfs_create_u32("bad_page_cnt_threshold", 0444, dir, 2196 &con->bad_page_cnt_threshold); 2197 debugfs_create_u32("ras_num_recs", 0444, dir, &eeprom->ras_num_recs); 2198 debugfs_create_x32("ras_hw_enabled", 0444, dir, &adev->ras_hw_enabled); 2199 debugfs_create_x32("ras_enabled", 0444, dir, &adev->ras_enabled); 2200 debugfs_create_file("ras_eeprom_size", S_IRUGO, dir, adev, 2201 &amdgpu_ras_debugfs_eeprom_size_ops); 2202 con->de_ras_eeprom_table = debugfs_create_file("ras_eeprom_table", 2203 S_IRUGO, dir, adev, 2204 &amdgpu_ras_debugfs_eeprom_table_ops); 2205 amdgpu_ras_debugfs_set_ret_size(&con->eeprom_control); 2206 2207 /* 2208 * After one uncorrectable error happens, usually GPU recovery will 2209 * be scheduled. But due to the known problem in GPU recovery failing 2210 * to bring GPU back, below interface provides one direct way to 2211 * user to reboot system automatically in such case within 2212 * ERREVENT_ATHUB_INTERRUPT generated. 
Normal GPU recovery routine 2213 * will never be called. 2214 */ 2215 debugfs_create_bool("auto_reboot", S_IWUGO | S_IRUGO, dir, &con->reboot); 2216 2217 /* 2218 * User could set this not to clean up hardware's error count register 2219 * of RAS IPs during ras recovery. 2220 */ 2221 debugfs_create_bool("disable_ras_err_cnt_harvest", 0644, dir, 2222 &con->disable_ras_err_cnt_harvest); 2223 return dir; 2224 } 2225 2226 static void amdgpu_ras_debugfs_create(struct amdgpu_device *adev, 2227 struct ras_fs_if *head, 2228 struct dentry *dir) 2229 { 2230 struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head->head); 2231 2232 if (!obj || !dir) 2233 return; 2234 2235 get_obj(obj); 2236 2237 memcpy(obj->fs_data.debugfs_name, 2238 head->debugfs_name, 2239 sizeof(obj->fs_data.debugfs_name)); 2240 2241 debugfs_create_file(obj->fs_data.debugfs_name, S_IWUGO | S_IRUGO, dir, 2242 obj, &amdgpu_ras_debugfs_ops); 2243 } 2244 2245 static bool amdgpu_ras_aca_is_supported(struct amdgpu_device *adev) 2246 { 2247 bool ret; 2248 2249 switch (amdgpu_ip_version(adev, MP0_HWIP, 0)) { 2250 case IP_VERSION(13, 0, 6): 2251 case IP_VERSION(13, 0, 12): 2252 case IP_VERSION(13, 0, 14): 2253 case IP_VERSION(13, 0, 15): 2254 ret = true; 2255 break; 2256 default: 2257 ret = false; 2258 break; 2259 } 2260 2261 return ret; 2262 } 2263 2264 void amdgpu_ras_debugfs_create_all(struct amdgpu_device *adev) 2265 { 2266 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 2267 struct dentry *dir; 2268 struct ras_manager *obj; 2269 struct ras_fs_if fs_info; 2270 2271 /* 2272 * it won't be called in resume path, no need to check 2273 * suspend and gpu reset status 2274 */ 2275 if (!IS_ENABLED(CONFIG_DEBUG_FS) || !con) 2276 return; 2277 2278 dir = amdgpu_ras_debugfs_create_ctrl_node(adev); 2279 2280 list_for_each_entry(obj, &con->head, node) { 2281 if (amdgpu_ras_is_supported(adev, obj->head.block) && 2282 (obj->attr_inuse == 1)) { 2283 snprintf(fs_info.debugfs_name, sizeof(fs_info.debugfs_name), 2284 
"%s_err_inject", 2285 get_ras_block_str(&obj->head)); 2286 fs_info.head = obj->head; 2287 amdgpu_ras_debugfs_create(adev, &fs_info, dir); 2288 } 2289 } 2290 2291 if (amdgpu_ras_aca_is_supported(adev)) { 2292 if (amdgpu_aca_is_enabled(adev)) 2293 amdgpu_aca_smu_debugfs_init(adev, dir); 2294 else 2295 amdgpu_mca_smu_debugfs_init(adev, dir); 2296 } 2297 } 2298 2299 /* debugfs end */ 2300 2301 /* ras fs */ 2302 static const BIN_ATTR(gpu_vram_bad_pages, S_IRUGO, 2303 amdgpu_ras_sysfs_badpages_read, NULL, 0); 2304 static DEVICE_ATTR(features, S_IRUGO, 2305 amdgpu_ras_sysfs_features_read, NULL); 2306 static DEVICE_ATTR(version, 0444, 2307 amdgpu_ras_sysfs_version_show, NULL); 2308 static DEVICE_ATTR(schema, 0444, 2309 amdgpu_ras_sysfs_schema_show, NULL); 2310 static DEVICE_ATTR(event_state, 0444, 2311 amdgpu_ras_sysfs_event_state_show, NULL); 2312 static int amdgpu_ras_fs_init(struct amdgpu_device *adev) 2313 { 2314 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 2315 struct attribute_group group = { 2316 .name = RAS_FS_NAME, 2317 }; 2318 struct attribute *attrs[] = { 2319 &con->features_attr.attr, 2320 &con->version_attr.attr, 2321 &con->schema_attr.attr, 2322 &con->event_state_attr.attr, 2323 NULL 2324 }; 2325 const struct bin_attribute *bin_attrs[] = { 2326 NULL, 2327 NULL, 2328 }; 2329 int r; 2330 2331 group.attrs = attrs; 2332 2333 /* add features entry */ 2334 con->features_attr = dev_attr_features; 2335 sysfs_attr_init(attrs[0]); 2336 2337 /* add version entry */ 2338 con->version_attr = dev_attr_version; 2339 sysfs_attr_init(attrs[1]); 2340 2341 /* add schema entry */ 2342 con->schema_attr = dev_attr_schema; 2343 sysfs_attr_init(attrs[2]); 2344 2345 /* add event_state entry */ 2346 con->event_state_attr = dev_attr_event_state; 2347 sysfs_attr_init(attrs[3]); 2348 2349 if (amdgpu_bad_page_threshold != 0) { 2350 /* add bad_page_features entry */ 2351 con->badpages_attr = bin_attr_gpu_vram_bad_pages; 2352 sysfs_bin_attr_init(&con->badpages_attr); 2353 
bin_attrs[0] = &con->badpages_attr; 2354 group.bin_attrs = bin_attrs; 2355 } 2356 2357 r = sysfs_create_group(&adev->dev->kobj, &group); 2358 if (r) 2359 dev_err(adev->dev, "Failed to create RAS sysfs group!"); 2360 2361 return 0; 2362 } 2363 2364 static int amdgpu_ras_fs_fini(struct amdgpu_device *adev) 2365 { 2366 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 2367 struct ras_manager *con_obj, *ip_obj, *tmp; 2368 2369 if (IS_ENABLED(CONFIG_DEBUG_FS)) { 2370 list_for_each_entry_safe(con_obj, tmp, &con->head, node) { 2371 ip_obj = amdgpu_ras_find_obj(adev, &con_obj->head); 2372 if (ip_obj) 2373 put_obj(ip_obj); 2374 } 2375 } 2376 2377 amdgpu_ras_sysfs_remove_all(adev); 2378 return 0; 2379 } 2380 /* ras fs end */ 2381 2382 /* ih begin */ 2383 2384 /* For the hardware that cannot enable bif ring for both ras_controller_irq 2385 * and ras_err_evnet_athub_irq ih cookies, the driver has to poll status 2386 * register to check whether the interrupt is triggered or not, and properly 2387 * ack the interrupt if it is there 2388 */ 2389 void amdgpu_ras_interrupt_fatal_error_handler(struct amdgpu_device *adev) 2390 { 2391 /* Fatal error events are handled on host side */ 2392 if (amdgpu_sriov_vf(adev)) 2393 return; 2394 /* 2395 * If the current interrupt is caused by a non-fatal RAS error, skip 2396 * check for fatal error. For fatal errors, FED status of all devices 2397 * in XGMI hive gets set when the first device gets fatal error 2398 * interrupt. The error gets propagated to other devices as well, so 2399 * make sure to ack the interrupt regardless of FED status. 
2400 */ 2401 if (!amdgpu_ras_get_fed_status(adev) && 2402 amdgpu_ras_is_err_state(adev, AMDGPU_RAS_BLOCK__ANY)) 2403 return; 2404 2405 if (amdgpu_uniras_enabled(adev)) { 2406 amdgpu_ras_mgr_handle_fatal_interrupt(adev, NULL); 2407 return; 2408 } 2409 2410 if (adev->nbio.ras && 2411 adev->nbio.ras->handle_ras_controller_intr_no_bifring) 2412 adev->nbio.ras->handle_ras_controller_intr_no_bifring(adev); 2413 2414 if (adev->nbio.ras && 2415 adev->nbio.ras->handle_ras_err_event_athub_intr_no_bifring) 2416 adev->nbio.ras->handle_ras_err_event_athub_intr_no_bifring(adev); 2417 } 2418 2419 static void amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager *obj, 2420 struct amdgpu_iv_entry *entry) 2421 { 2422 bool poison_stat = false; 2423 struct amdgpu_device *adev = obj->adev; 2424 struct amdgpu_ras_block_object *block_obj = 2425 amdgpu_ras_get_ras_block(adev, obj->head.block, 0); 2426 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 2427 enum ras_event_type type = RAS_EVENT_TYPE_POISON_CONSUMPTION; 2428 u64 event_id; 2429 int ret; 2430 2431 if (!block_obj || !con) 2432 return; 2433 2434 ret = amdgpu_ras_mark_ras_event(adev, type); 2435 if (ret) 2436 return; 2437 2438 amdgpu_ras_set_err_poison(adev, block_obj->ras_comm.block); 2439 /* both query_poison_status and handle_poison_consumption are optional, 2440 * but at least one of them should be implemented if we need poison 2441 * consumption handler 2442 */ 2443 if (block_obj->hw_ops && block_obj->hw_ops->query_poison_status) { 2444 poison_stat = block_obj->hw_ops->query_poison_status(adev); 2445 if (!poison_stat) { 2446 /* Not poison consumption interrupt, no need to handle it */ 2447 dev_info(adev->dev, "No RAS poison status in %s poison IH.\n", 2448 block_obj->ras_comm.name); 2449 2450 return; 2451 } 2452 } 2453 2454 amdgpu_umc_poison_handler(adev, obj->head.block, 0); 2455 2456 if (block_obj->hw_ops && block_obj->hw_ops->handle_poison_consumption) 2457 poison_stat = 
block_obj->hw_ops->handle_poison_consumption(adev); 2458 2459 /* gpu reset is fallback for failed and default cases. 2460 * For RMA case, amdgpu_umc_poison_handler will handle gpu reset. 2461 */ 2462 if (poison_stat && !amdgpu_ras_is_rma(adev)) { 2463 event_id = amdgpu_ras_acquire_event_id(adev, type); 2464 RAS_EVENT_LOG(adev, event_id, 2465 "GPU reset for %s RAS poison consumption is issued!\n", 2466 block_obj->ras_comm.name); 2467 amdgpu_ras_reset_gpu(adev); 2468 } 2469 2470 if (!poison_stat) 2471 amdgpu_gfx_poison_consumption_handler(adev, entry); 2472 } 2473 2474 static void amdgpu_ras_interrupt_poison_creation_handler(struct ras_manager *obj, 2475 struct amdgpu_iv_entry *entry) 2476 { 2477 struct amdgpu_device *adev = obj->adev; 2478 enum ras_event_type type = RAS_EVENT_TYPE_POISON_CREATION; 2479 u64 event_id; 2480 int ret; 2481 2482 ret = amdgpu_ras_mark_ras_event(adev, type); 2483 if (ret) 2484 return; 2485 2486 event_id = amdgpu_ras_acquire_event_id(adev, type); 2487 RAS_EVENT_LOG(adev, event_id, "Poison is created\n"); 2488 2489 if (amdgpu_ip_version(obj->adev, UMC_HWIP, 0) >= IP_VERSION(12, 0, 0)) { 2490 struct amdgpu_ras *con = amdgpu_ras_get_context(obj->adev); 2491 2492 atomic_inc(&con->page_retirement_req_cnt); 2493 atomic_inc(&con->poison_creation_count); 2494 2495 wake_up(&con->page_retirement_wq); 2496 } 2497 } 2498 2499 static void amdgpu_ras_interrupt_umc_handler(struct ras_manager *obj, 2500 struct amdgpu_iv_entry *entry) 2501 { 2502 struct ras_ih_data *data = &obj->ih_data; 2503 struct ras_err_data err_data; 2504 int ret; 2505 2506 if (!data->cb) 2507 return; 2508 2509 ret = amdgpu_ras_error_data_init(&err_data); 2510 if (ret) 2511 return; 2512 2513 /* Let IP handle its data, maybe we need get the output 2514 * from the callback to update the error type/count, etc 2515 */ 2516 amdgpu_ras_set_fed(obj->adev, true); 2517 ret = data->cb(obj->adev, &err_data, entry); 2518 /* ue will trigger an interrupt, and in that case 2519 * we need do a reset to 
recovery the whole system. 2520 * But leave IP do that recovery, here we just dispatch 2521 * the error. 2522 */ 2523 if (ret == AMDGPU_RAS_SUCCESS) { 2524 /* these counts could be left as 0 if 2525 * some blocks do not count error number 2526 */ 2527 obj->err_data.ue_count += err_data.ue_count; 2528 obj->err_data.ce_count += err_data.ce_count; 2529 obj->err_data.de_count += err_data.de_count; 2530 } 2531 2532 amdgpu_ras_error_data_fini(&err_data); 2533 } 2534 2535 static void amdgpu_ras_interrupt_handler(struct ras_manager *obj) 2536 { 2537 struct ras_ih_data *data = &obj->ih_data; 2538 struct amdgpu_iv_entry entry; 2539 2540 while (data->rptr != data->wptr) { 2541 rmb(); 2542 memcpy(&entry, &data->ring[data->rptr], 2543 data->element_size); 2544 2545 wmb(); 2546 data->rptr = (data->aligned_element_size + 2547 data->rptr) % data->ring_size; 2548 2549 if (amdgpu_ras_is_poison_mode_supported(obj->adev)) { 2550 if (obj->head.block == AMDGPU_RAS_BLOCK__UMC) 2551 amdgpu_ras_interrupt_poison_creation_handler(obj, &entry); 2552 else 2553 amdgpu_ras_interrupt_poison_consumption_handler(obj, &entry); 2554 } else { 2555 if (obj->head.block == AMDGPU_RAS_BLOCK__UMC) 2556 amdgpu_ras_interrupt_umc_handler(obj, &entry); 2557 else 2558 dev_warn(obj->adev->dev, 2559 "No RAS interrupt handler for non-UMC block with poison disabled.\n"); 2560 } 2561 } 2562 } 2563 2564 static void amdgpu_ras_interrupt_process_handler(struct work_struct *work) 2565 { 2566 struct ras_ih_data *data = 2567 container_of(work, struct ras_ih_data, ih_work); 2568 struct ras_manager *obj = 2569 container_of(data, struct ras_manager, ih_data); 2570 2571 amdgpu_ras_interrupt_handler(obj); 2572 } 2573 2574 int amdgpu_ras_interrupt_dispatch(struct amdgpu_device *adev, 2575 struct ras_dispatch_if *info) 2576 { 2577 struct ras_manager *obj; 2578 struct ras_ih_data *data; 2579 2580 if (amdgpu_uniras_enabled(adev)) { 2581 struct ras_ih_info ih_info; 2582 2583 memset(&ih_info, 0, sizeof(ih_info)); 2584 ih_info.block 
= info->head.block; 2585 memcpy(&ih_info.iv_entry, info->entry, sizeof(struct amdgpu_iv_entry)); 2586 2587 return amdgpu_ras_mgr_handle_controller_interrupt(adev, &ih_info); 2588 } 2589 2590 obj = amdgpu_ras_find_obj(adev, &info->head); 2591 if (!obj) 2592 return -EINVAL; 2593 2594 data = &obj->ih_data; 2595 2596 if (data->inuse == 0) 2597 return 0; 2598 2599 /* Might be overflow... */ 2600 memcpy(&data->ring[data->wptr], info->entry, 2601 data->element_size); 2602 2603 wmb(); 2604 data->wptr = (data->aligned_element_size + 2605 data->wptr) % data->ring_size; 2606 2607 schedule_work(&data->ih_work); 2608 2609 return 0; 2610 } 2611 2612 int amdgpu_ras_interrupt_remove_handler(struct amdgpu_device *adev, 2613 struct ras_common_if *head) 2614 { 2615 struct ras_manager *obj = amdgpu_ras_find_obj(adev, head); 2616 struct ras_ih_data *data; 2617 2618 if (!obj) 2619 return -EINVAL; 2620 2621 data = &obj->ih_data; 2622 if (data->inuse == 0) 2623 return 0; 2624 2625 cancel_work_sync(&data->ih_work); 2626 2627 kfree(data->ring); 2628 memset(data, 0, sizeof(*data)); 2629 put_obj(obj); 2630 2631 return 0; 2632 } 2633 2634 int amdgpu_ras_interrupt_add_handler(struct amdgpu_device *adev, 2635 struct ras_common_if *head) 2636 { 2637 struct ras_manager *obj = amdgpu_ras_find_obj(adev, head); 2638 struct ras_ih_data *data; 2639 struct amdgpu_ras_block_object *ras_obj; 2640 2641 if (!obj) { 2642 /* in case we registe the IH before enable ras feature */ 2643 obj = amdgpu_ras_create_obj(adev, head); 2644 if (!obj) 2645 return -EINVAL; 2646 } else 2647 get_obj(obj); 2648 2649 ras_obj = container_of(head, struct amdgpu_ras_block_object, ras_comm); 2650 2651 data = &obj->ih_data; 2652 /* add the callback.etc */ 2653 *data = (struct ras_ih_data) { 2654 .inuse = 0, 2655 .cb = ras_obj->ras_cb, 2656 .element_size = sizeof(struct amdgpu_iv_entry), 2657 .rptr = 0, 2658 .wptr = 0, 2659 }; 2660 2661 INIT_WORK(&data->ih_work, amdgpu_ras_interrupt_process_handler); 2662 2663 
data->aligned_element_size = ALIGN(data->element_size, 8); 2664 /* the ring can store 64 iv entries. */ 2665 data->ring_size = 64 * data->aligned_element_size; 2666 data->ring = kmalloc(data->ring_size, GFP_KERNEL); 2667 if (!data->ring) { 2668 put_obj(obj); 2669 return -ENOMEM; 2670 } 2671 2672 /* IH is ready */ 2673 data->inuse = 1; 2674 2675 return 0; 2676 } 2677 2678 static int amdgpu_ras_interrupt_remove_all(struct amdgpu_device *adev) 2679 { 2680 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 2681 struct ras_manager *obj, *tmp; 2682 2683 list_for_each_entry_safe(obj, tmp, &con->head, node) { 2684 amdgpu_ras_interrupt_remove_handler(adev, &obj->head); 2685 } 2686 2687 return 0; 2688 } 2689 /* ih end */ 2690 2691 /* traversal all IPs except NBIO to query error counter */ 2692 static void amdgpu_ras_log_on_err_counter(struct amdgpu_device *adev, enum ras_event_type type) 2693 { 2694 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 2695 struct ras_manager *obj; 2696 2697 if (!adev->ras_enabled || !con) 2698 return; 2699 2700 list_for_each_entry(obj, &con->head, node) { 2701 struct ras_query_if info = { 2702 .head = obj->head, 2703 }; 2704 2705 /* 2706 * PCIE_BIF IP has one different isr by ras controller 2707 * interrupt, the specific ras counter query will be 2708 * done in that isr. So skip such block from common 2709 * sync flood interrupt isr calling. 2710 */ 2711 if (info.head.block == AMDGPU_RAS_BLOCK__PCIE_BIF) 2712 continue; 2713 2714 /* 2715 * this is a workaround for aldebaran, skip send msg to 2716 * smu to get ecc_info table due to smu handle get ecc 2717 * info table failed temporarily. 2718 * should be removed until smu fix handle ecc_info table. 
2719 */ 2720 if ((info.head.block == AMDGPU_RAS_BLOCK__UMC) && 2721 (amdgpu_ip_version(adev, MP1_HWIP, 0) == 2722 IP_VERSION(13, 0, 2))) 2723 continue; 2724 2725 amdgpu_ras_query_error_status_with_event(adev, &info, type); 2726 2727 if (amdgpu_ip_version(adev, MP0_HWIP, 0) != 2728 IP_VERSION(11, 0, 2) && 2729 amdgpu_ip_version(adev, MP0_HWIP, 0) != 2730 IP_VERSION(11, 0, 4) && 2731 amdgpu_ip_version(adev, MP0_HWIP, 0) != 2732 IP_VERSION(13, 0, 0)) { 2733 if (amdgpu_ras_reset_error_status(adev, info.head.block)) 2734 dev_warn(adev->dev, "Failed to reset error counter and error status"); 2735 } 2736 } 2737 } 2738 2739 /* Parse RdRspStatus and WrRspStatus */ 2740 static void amdgpu_ras_error_status_query(struct amdgpu_device *adev, 2741 struct ras_query_if *info) 2742 { 2743 struct amdgpu_ras_block_object *block_obj; 2744 /* 2745 * Only two block need to query read/write 2746 * RspStatus at current state 2747 */ 2748 if ((info->head.block != AMDGPU_RAS_BLOCK__GFX) && 2749 (info->head.block != AMDGPU_RAS_BLOCK__MMHUB)) 2750 return; 2751 2752 block_obj = amdgpu_ras_get_ras_block(adev, 2753 info->head.block, 2754 info->head.sub_block_index); 2755 2756 if (!block_obj || !block_obj->hw_ops) { 2757 dev_dbg_once(adev->dev, "%s doesn't config RAS function\n", 2758 get_ras_block_str(&info->head)); 2759 return; 2760 } 2761 2762 if (block_obj->hw_ops->query_ras_error_status) 2763 block_obj->hw_ops->query_ras_error_status(adev); 2764 2765 } 2766 2767 static void amdgpu_ras_query_err_status(struct amdgpu_device *adev) 2768 { 2769 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 2770 struct ras_manager *obj; 2771 2772 if (!adev->ras_enabled || !con) 2773 return; 2774 2775 list_for_each_entry(obj, &con->head, node) { 2776 struct ras_query_if info = { 2777 .head = obj->head, 2778 }; 2779 2780 amdgpu_ras_error_status_query(adev, &info); 2781 } 2782 } 2783 2784 static int amdgpu_ras_badpages_read(struct amdgpu_device *adev, 2785 struct ras_badpage *bps, uint32_t count, uint32_t 
start) 2786 { 2787 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 2788 struct ras_err_handler_data *data; 2789 int r = 0; 2790 uint32_t i; 2791 2792 if (!con || !con->eh_data || !bps || !count) 2793 return -EINVAL; 2794 2795 mutex_lock(&con->recovery_lock); 2796 data = con->eh_data; 2797 if (start < data->count) { 2798 for (i = start; i < data->count; i++) { 2799 if (!data->bps[i].ts) 2800 continue; 2801 2802 /* U64_MAX is used to mark the record as invalid */ 2803 if (data->bps[i].retired_page == U64_MAX) 2804 continue; 2805 2806 bps[r].bp = data->bps[i].retired_page; 2807 r++; 2808 if (r >= count) 2809 break; 2810 } 2811 } 2812 mutex_unlock(&con->recovery_lock); 2813 2814 return r; 2815 } 2816 2817 static int amdgpu_uniras_badpages_read(struct amdgpu_device *adev, 2818 struct ras_badpage *bps, uint32_t count, uint32_t start) 2819 { 2820 struct ras_cmd_bad_pages_info_req cmd_input; 2821 struct ras_cmd_bad_pages_info_rsp *output; 2822 uint32_t group, start_group, end_group; 2823 uint32_t pos, pos_in_group; 2824 int r = 0, i; 2825 2826 if (!bps || !count) 2827 return -EINVAL; 2828 2829 output = kmalloc_obj(*output); 2830 if (!output) 2831 return -ENOMEM; 2832 2833 memset(&cmd_input, 0, sizeof(cmd_input)); 2834 2835 start_group = start / RAS_CMD_MAX_BAD_PAGES_PER_GROUP; 2836 end_group = (start + count + RAS_CMD_MAX_BAD_PAGES_PER_GROUP - 1) / 2837 RAS_CMD_MAX_BAD_PAGES_PER_GROUP; 2838 2839 pos = start; 2840 for (group = start_group; group < end_group; group++) { 2841 memset(output, 0, sizeof(*output)); 2842 cmd_input.group_index = group; 2843 if (amdgpu_ras_mgr_handle_ras_cmd(adev, RAS_CMD__GET_BAD_PAGES, 2844 &cmd_input, sizeof(cmd_input), output, sizeof(*output))) 2845 goto out; 2846 2847 if (pos >= output->bp_total_cnt) 2848 goto out; 2849 2850 pos_in_group = pos - group * RAS_CMD_MAX_BAD_PAGES_PER_GROUP; 2851 for (i = pos_in_group; i < output->bp_in_group; i++, pos++) { 2852 if (!output->records[i].ts) 2853 continue; 2854 2855 bps[r].bp = 
/*
 * Work handler for ras->recovery_work.
 *
 * Steps, in order:
 *  1. If the device belongs to an XGMI hive, flag the hive as being in RAS
 *     recovery and, if any hive member reports fatal-error status, set that
 *     status on every member.
 *  2. Unless error-count harvesting is disabled, query and log RAS errors
 *     on each device in scope (the whole hive, or just this device).
 *  3. If amdgpu_device_should_recover_gpu() agrees, build a reset context
 *     and trigger GPU recovery.
 *  4. Clear the in_recovery flags and drop the hive reference.
 */
static void amdgpu_ras_do_recovery(struct work_struct *work)
{
	struct amdgpu_ras *ras =
		container_of(work, struct amdgpu_ras, recovery_work);
	struct amdgpu_device *remote_adev = NULL;
	struct amdgpu_device *adev = ras->adev;
	struct list_head device_list, *device_list_handle = NULL;
	/* takes a hive reference; released at the end of this function */
	struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
	unsigned int error_query_mode;
	enum ras_event_type type;

	if (hive) {
		atomic_set(&hive->ras_recovery, 1);

		/* If any device which is part of the hive received RAS fatal
		 * error interrupt, set fatal error status on all. This
		 * condition will need a recovery, and flag will be cleared
		 * as part of recovery.
		 */
		list_for_each_entry(remote_adev, &hive->device_list,
				    gmc.xgmi.head)
			if (amdgpu_ras_get_fed_status(remote_adev)) {
				amdgpu_ras_set_fed_all(adev, hive, true);
				break;
			}
	}
	if (!ras->disable_ras_err_cnt_harvest) {

		/* Build list of devices to query RAS related errors */
		if (hive && adev->gmc.xgmi.num_physical_nodes > 1) {
			device_list_handle = &hive->device_list;
		} else {
			/*
			 * NOTE(review): adev's xgmi.head is linked into this
			 * stack-local list and no list_del() is visible in this
			 * function - confirm nothing traverses it after return.
			 */
			INIT_LIST_HEAD(&device_list);
			list_add_tail(&adev->gmc.xgmi.head, &device_list);
			device_list_handle = &device_list;
		}

		/* error_query_mode is only read when the query call succeeded */
		if (amdgpu_ras_get_error_query_mode(adev, &error_query_mode)) {
			if (error_query_mode == AMDGPU_RAS_FIRMWARE_ERROR_QUERY) {
				/* wait 500ms to ensure pmfw polling mca bank info done */
				msleep(500);
			}
		}

		/* fatal vs. poison-consumption, decided once for all devices */
		type = amdgpu_ras_get_fatal_error_event(adev);
		list_for_each_entry(remote_adev,
				device_list_handle, gmc.xgmi.head) {
			if (amdgpu_uniras_enabled(remote_adev)) {
				amdgpu_ras_mgr_update_ras_ecc(remote_adev);
			} else {
				amdgpu_ras_query_err_status(remote_adev);
				amdgpu_ras_log_on_err_counter(remote_adev, type);
			}
		}

	}

	if (amdgpu_device_should_recover_gpu(ras->adev)) {
		struct amdgpu_reset_context reset_context;
		memset(&reset_context, 0, sizeof(reset_context));

		reset_context.method = AMD_RESET_METHOD_NONE;
		reset_context.reset_req_dev = adev;
		reset_context.src = AMDGPU_RESET_SRC_RAS;
		set_bit(AMDGPU_SKIP_COREDUMP, &reset_context.flags);

		/* Perform full reset in fatal error mode */
		if (!amdgpu_ras_is_poison_mode_supported(ras->adev))
			set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
		else {
			clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);

			/* each pending reset request flag is consumed (cleared) here */
			if (ras->gpu_reset_flags & AMDGPU_RAS_GPU_RESET_MODE2_RESET) {
				ras->gpu_reset_flags &= ~AMDGPU_RAS_GPU_RESET_MODE2_RESET;
				reset_context.method = AMD_RESET_METHOD_MODE2;
			}

			/* Fatal error occurs in poison mode, mode1 reset is used to
			 * recover gpu.
			 */
			if (ras->gpu_reset_flags & AMDGPU_RAS_GPU_RESET_MODE1_RESET) {
				ras->gpu_reset_flags &= ~AMDGPU_RAS_GPU_RESET_MODE1_RESET;
				set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);

				psp_fatal_error_recovery_quirk(&adev->psp);
			}
		}

		amdgpu_device_gpu_recover(ras->adev, NULL, &reset_context);
	}
	/* recovery attempt finished: clear the per-device and per-hive flags */
	atomic_set(&ras->in_recovery, 0);
	if (hive) {
		atomic_set(&hive->ras_recovery, 0);
		amdgpu_put_xgmi_hive(hive);
	}
}
not used */ 3047 addr_in.ma.node_inst = TA_RAS_INV_NODE; 3048 } else { 3049 addr_in.ma.umc_inst = bps->mcumc_id; 3050 addr_in.ma.node_inst = bps->cu; 3051 } 3052 3053 if (adev->umc.ras && adev->umc.ras->convert_ras_err_addr) 3054 ret = adev->umc.ras->convert_ras_err_addr(adev, err_data, 3055 &addr_in, NULL, false); 3056 3057 return ret; 3058 } 3059 3060 static int amdgpu_ras_mca2pa(struct amdgpu_device *adev, 3061 struct eeprom_table_record *bps, 3062 struct ras_err_data *err_data) 3063 { 3064 struct ta_ras_query_address_input addr_in; 3065 uint32_t die_id, socket = 0; 3066 3067 if (adev->smuio.funcs && adev->smuio.funcs->get_socket_id) 3068 socket = adev->smuio.funcs->get_socket_id(adev); 3069 3070 /* although die id is gotten from PA in nps1 mode, the id is 3071 * fitable for any nps mode 3072 */ 3073 if (adev->umc.ras && adev->umc.ras->get_die_id_from_pa) 3074 die_id = adev->umc.ras->get_die_id_from_pa(adev, bps->address, 3075 bps->retired_page << AMDGPU_GPU_PAGE_SHIFT); 3076 else 3077 return -EINVAL; 3078 3079 /* reinit err_data */ 3080 err_data->err_addr_cnt = 0; 3081 err_data->err_addr_len = adev->umc.retire_unit; 3082 3083 memset(&addr_in, 0, sizeof(addr_in)); 3084 addr_in.ma.err_addr = bps->address; 3085 addr_in.ma.ch_inst = bps->mem_channel; 3086 addr_in.ma.umc_inst = bps->mcumc_id; 3087 addr_in.ma.node_inst = die_id; 3088 addr_in.ma.socket_id = socket; 3089 3090 if (adev->umc.ras && adev->umc.ras->convert_ras_err_addr) 3091 return adev->umc.ras->convert_ras_err_addr(adev, err_data, 3092 &addr_in, NULL, false); 3093 else 3094 return -EINVAL; 3095 } 3096 3097 static int __amdgpu_ras_restore_bad_pages(struct amdgpu_device *adev, 3098 struct eeprom_table_record *bps, int count) 3099 { 3100 int j; 3101 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 3102 struct ras_err_handler_data *data = con->eh_data; 3103 3104 for (j = 0; j < count; j++) { 3105 if (!data->space_left && 3106 amdgpu_ras_realloc_eh_data_space(adev, data, 256)) { 3107 return -ENOMEM; 
3108 } 3109 3110 if (amdgpu_ras_check_bad_page_unlock(con, 3111 bps[j].retired_page << AMDGPU_GPU_PAGE_SHIFT)) { 3112 /* set to U64_MAX to mark it as invalid */ 3113 data->bps[data->count].retired_page = U64_MAX; 3114 data->count++; 3115 data->space_left--; 3116 continue; 3117 } 3118 3119 amdgpu_ras_reserve_page(adev, bps[j].retired_page); 3120 3121 memcpy(&data->bps[data->count], &(bps[j]), 3122 sizeof(struct eeprom_table_record)); 3123 data->count++; 3124 data->space_left--; 3125 con->bad_page_num++; 3126 } 3127 3128 return 0; 3129 } 3130 3131 static int __amdgpu_ras_convert_rec_array_from_rom(struct amdgpu_device *adev, 3132 struct eeprom_table_record *bps, struct ras_err_data *err_data, 3133 enum amdgpu_memory_partition nps) 3134 { 3135 int i = 0; 3136 uint64_t chan_idx_v2; 3137 enum amdgpu_memory_partition save_nps; 3138 3139 save_nps = (bps[0].retired_page >> UMC_NPS_SHIFT) & UMC_NPS_MASK; 3140 chan_idx_v2 = bps[0].retired_page & UMC_CHANNEL_IDX_V2; 3141 3142 /*old asics just have pa in eeprom*/ 3143 if (IP_VERSION_MAJ(amdgpu_ip_version(adev, UMC_HWIP, 0)) < 12) { 3144 memcpy(err_data->err_addr, bps, 3145 sizeof(struct eeprom_table_record) * adev->umc.retire_unit); 3146 goto out; 3147 } 3148 3149 for (i = 0; i < adev->umc.retire_unit; i++) 3150 bps[i].retired_page &= ~(UMC_NPS_MASK << UMC_NPS_SHIFT); 3151 3152 if (save_nps || chan_idx_v2) { 3153 if (save_nps == nps) { 3154 if (amdgpu_umc_pages_in_a_row(adev, err_data, 3155 bps[0].retired_page << AMDGPU_GPU_PAGE_SHIFT)) 3156 return -EINVAL; 3157 for (i = 0; i < adev->umc.retire_unit; i++) { 3158 err_data->err_addr[i].address = bps[0].address; 3159 err_data->err_addr[i].mem_channel = bps[0].mem_channel; 3160 err_data->err_addr[i].bank = bps[0].bank; 3161 err_data->err_addr[i].err_type = bps[0].err_type; 3162 err_data->err_addr[i].mcumc_id = bps[0].mcumc_id; 3163 } 3164 } else { 3165 if (amdgpu_ras_mca2pa_by_idx(adev, &bps[0], err_data)) 3166 return -EINVAL; 3167 } 3168 } else { 3169 if (bps[0].address == 0) { 
3170 /* for specific old eeprom data, mca address is not stored, 3171 * calc it from pa 3172 */ 3173 if (amdgpu_umc_pa2mca(adev, bps[0].retired_page << AMDGPU_GPU_PAGE_SHIFT, 3174 &(bps[0].address), AMDGPU_NPS1_PARTITION_MODE)) 3175 return -EINVAL; 3176 } 3177 3178 if (amdgpu_ras_mca2pa(adev, &bps[0], err_data)) { 3179 if (nps == AMDGPU_NPS1_PARTITION_MODE) 3180 memcpy(err_data->err_addr, bps, 3181 sizeof(struct eeprom_table_record) * adev->umc.retire_unit); 3182 else 3183 return -EOPNOTSUPP; 3184 } 3185 } 3186 3187 out: 3188 return __amdgpu_ras_restore_bad_pages(adev, err_data->err_addr, adev->umc.retire_unit); 3189 } 3190 3191 static int __amdgpu_ras_convert_rec_from_rom(struct amdgpu_device *adev, 3192 struct eeprom_table_record *bps, struct ras_err_data *err_data, 3193 enum amdgpu_memory_partition nps) 3194 { 3195 int i = 0; 3196 uint64_t chan_idx_v2; 3197 enum amdgpu_memory_partition save_nps; 3198 3199 if (!amdgpu_ras_smu_eeprom_supported(adev)) { 3200 save_nps = (bps->retired_page >> UMC_NPS_SHIFT) & UMC_NPS_MASK; 3201 chan_idx_v2 = bps->retired_page & UMC_CHANNEL_IDX_V2; 3202 bps->retired_page &= ~(UMC_NPS_MASK << UMC_NPS_SHIFT); 3203 } else { 3204 /* if pmfw manages eeprom, save_nps is not stored on eeprom, 3205 * we should always convert mca address into physical address, 3206 * make save_nps different from nps 3207 */ 3208 save_nps = nps + 1; 3209 } 3210 3211 if (save_nps == nps) { 3212 if (amdgpu_umc_pages_in_a_row(adev, err_data, 3213 bps->retired_page << AMDGPU_GPU_PAGE_SHIFT)) 3214 return -EINVAL; 3215 for (i = 0; i < adev->umc.retire_unit; i++) { 3216 err_data->err_addr[i].address = bps->address; 3217 err_data->err_addr[i].mem_channel = bps->mem_channel; 3218 err_data->err_addr[i].bank = bps->bank; 3219 err_data->err_addr[i].err_type = bps->err_type; 3220 err_data->err_addr[i].mcumc_id = bps->mcumc_id; 3221 } 3222 } else { 3223 if (save_nps || chan_idx_v2) { 3224 if (amdgpu_ras_mca2pa_by_idx(adev, bps, err_data)) 3225 return -EINVAL; 3226 } else { 
3227 /* for specific old eeprom data, mca address is not stored, 3228 * calc it from pa 3229 */ 3230 if (bps->address == 0) 3231 if (amdgpu_umc_pa2mca(adev, 3232 bps->retired_page << AMDGPU_GPU_PAGE_SHIFT, 3233 &(bps->address), 3234 AMDGPU_NPS1_PARTITION_MODE)) 3235 return -EINVAL; 3236 3237 if (amdgpu_ras_mca2pa(adev, bps, err_data)) 3238 return -EOPNOTSUPP; 3239 } 3240 } 3241 3242 return __amdgpu_ras_restore_bad_pages(adev, err_data->err_addr, 3243 adev->umc.retire_unit); 3244 } 3245 3246 /* it deal with vram only. */ 3247 int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev, 3248 struct eeprom_table_record *bps, int pages, bool from_rom) 3249 { 3250 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 3251 struct ras_err_data err_data; 3252 struct amdgpu_ras_eeprom_control *control = 3253 &adev->psp.ras_context.ras->eeprom_control; 3254 enum amdgpu_memory_partition nps = AMDGPU_NPS1_PARTITION_MODE; 3255 int ret = 0; 3256 uint32_t i = 0; 3257 3258 if (!con || !con->eh_data || !bps || pages <= 0) 3259 return 0; 3260 3261 if (from_rom) { 3262 err_data.err_addr = 3263 kzalloc_objs(struct eeprom_table_record, 3264 adev->umc.retire_unit); 3265 if (!err_data.err_addr) { 3266 dev_warn(adev->dev, "Failed to alloc UMC error address record in mca2pa conversion!\n"); 3267 return -ENOMEM; 3268 } 3269 3270 if (adev->gmc.gmc_funcs->query_mem_partition_mode) 3271 nps = adev->gmc.gmc_funcs->query_mem_partition_mode(adev); 3272 } 3273 3274 mutex_lock(&con->recovery_lock); 3275 3276 if (from_rom) { 3277 /* there is no pa recs in V3, so skip pa recs processing */ 3278 if ((control->tbl_hdr.version < RAS_TABLE_VER_V3) && 3279 !amdgpu_ras_smu_eeprom_supported(adev)) { 3280 for (i = 0; i < pages; i++) { 3281 if (control->ras_num_recs - i >= adev->umc.retire_unit) { 3282 if ((bps[i].address == bps[i + 1].address) && 3283 (bps[i].mem_channel == bps[i + 1].mem_channel)) { 3284 /* deal with retire_unit records a time */ 3285 ret = __amdgpu_ras_convert_rec_array_from_rom(adev, 3286 
&bps[i], &err_data, nps); 3287 i += (adev->umc.retire_unit - 1); 3288 } else { 3289 break; 3290 } 3291 } else { 3292 break; 3293 } 3294 } 3295 } 3296 for (; i < pages; i++) { 3297 ret = __amdgpu_ras_convert_rec_from_rom(adev, 3298 &bps[i], &err_data, nps); 3299 } 3300 3301 con->eh_data->count_saved = con->eh_data->count; 3302 } else { 3303 ret = __amdgpu_ras_restore_bad_pages(adev, bps, pages); 3304 } 3305 3306 if (from_rom) 3307 kfree(err_data.err_addr); 3308 mutex_unlock(&con->recovery_lock); 3309 3310 return ret; 3311 } 3312 3313 /* 3314 * write error record array to eeprom, the function should be 3315 * protected by recovery_lock 3316 * new_cnt: new added UE count, excluding reserved bad pages, can be NULL 3317 */ 3318 int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev, 3319 unsigned long *new_cnt) 3320 { 3321 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 3322 struct ras_err_handler_data *data; 3323 struct amdgpu_ras_eeprom_control *control; 3324 int save_count, unit_num, i; 3325 3326 if (!con || !con->eh_data) { 3327 if (new_cnt) 3328 *new_cnt = 0; 3329 3330 return 0; 3331 } 3332 3333 if (!con->eeprom_control.is_eeprom_valid) { 3334 dev_warn(adev->dev, 3335 "Failed to save EEPROM table data because of EEPROM data corruption!"); 3336 if (new_cnt) 3337 *new_cnt = 0; 3338 3339 return 0; 3340 } 3341 3342 mutex_lock(&con->recovery_lock); 3343 control = &con->eeprom_control; 3344 data = con->eh_data; 3345 if (amdgpu_ras_smu_eeprom_supported(adev)) 3346 unit_num = control->ras_num_recs - 3347 control->ras_num_recs_old; 3348 else 3349 unit_num = data->count / adev->umc.retire_unit - 3350 control->ras_num_recs; 3351 3352 save_count = con->bad_page_num - control->ras_num_bad_pages; 3353 mutex_unlock(&con->recovery_lock); 3354 3355 if (new_cnt) 3356 *new_cnt = unit_num; 3357 3358 /* only new entries are saved */ 3359 if (unit_num && save_count) { 3360 /*old asics only save pa to eeprom like before*/ 3361 if (IP_VERSION_MAJ(amdgpu_ip_version(adev, UMC_HWIP, 
0)) < 12) { 3362 if (amdgpu_ras_eeprom_append(control, 3363 &data->bps[data->count_saved], unit_num)) { 3364 dev_err(adev->dev, "Failed to save EEPROM table data!"); 3365 return -EIO; 3366 } 3367 } else { 3368 for (i = 0; i < unit_num; i++) { 3369 if (amdgpu_ras_eeprom_append(control, 3370 &data->bps[data->count_saved + 3371 i * adev->umc.retire_unit], 1)) { 3372 dev_err(adev->dev, "Failed to save EEPROM table data!"); 3373 return -EIO; 3374 } 3375 } 3376 } 3377 3378 dev_info(adev->dev, "Saved %d pages to EEPROM table.\n", save_count); 3379 data->count_saved = data->count; 3380 } 3381 3382 return 0; 3383 } 3384 3385 /* 3386 * read error record array in eeprom and reserve enough space for 3387 * storing new bad pages 3388 */ 3389 static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev) 3390 { 3391 struct amdgpu_ras_eeprom_control *control = 3392 &adev->psp.ras_context.ras->eeprom_control; 3393 struct eeprom_table_record *bps; 3394 int ret, i = 0; 3395 3396 /* no bad page record, skip eeprom access */ 3397 if (control->ras_num_recs == 0 || amdgpu_bad_page_threshold == 0) 3398 return 0; 3399 3400 bps = kzalloc_objs(*bps, control->ras_num_recs); 3401 if (!bps) 3402 return -ENOMEM; 3403 3404 ret = amdgpu_ras_eeprom_read(control, bps, control->ras_num_recs); 3405 if (ret) { 3406 dev_err(adev->dev, "Failed to load EEPROM table records!"); 3407 } else { 3408 if (adev->umc.ras && adev->umc.ras->convert_ras_err_addr) { 3409 /*In V3, there is no pa recs, and some cases(when address==0) may be parsed 3410 as pa recs, so add verion check to avoid it. 
3411 */ 3412 if ((control->tbl_hdr.version < RAS_TABLE_VER_V3) && 3413 !amdgpu_ras_smu_eeprom_supported(adev)) { 3414 for (i = 0; i < control->ras_num_recs; i++) { 3415 if ((control->ras_num_recs - i) >= adev->umc.retire_unit) { 3416 if ((bps[i].address == bps[i + 1].address) && 3417 (bps[i].mem_channel == bps[i + 1].mem_channel)) { 3418 control->ras_num_pa_recs += adev->umc.retire_unit; 3419 i += (adev->umc.retire_unit - 1); 3420 } else { 3421 control->ras_num_mca_recs += 3422 (control->ras_num_recs - i); 3423 break; 3424 } 3425 } else { 3426 control->ras_num_mca_recs += (control->ras_num_recs - i); 3427 break; 3428 } 3429 } 3430 } else { 3431 control->ras_num_mca_recs = control->ras_num_recs; 3432 } 3433 } 3434 3435 ret = amdgpu_ras_add_bad_pages(adev, bps, control->ras_num_recs, true); 3436 if (ret) 3437 goto out; 3438 3439 ret = amdgpu_ras_eeprom_check(control); 3440 if (ret) 3441 goto out; 3442 3443 /* HW not usable */ 3444 if (amdgpu_ras_is_rma(adev)) 3445 ret = -EHWPOISON; 3446 } 3447 3448 out: 3449 kfree(bps); 3450 return ret; 3451 } 3452 3453 static int amdgpu_ras_check_bad_page_unlock(struct amdgpu_ras *con, 3454 uint64_t addr) 3455 { 3456 struct ras_err_handler_data *data = con->eh_data; 3457 struct amdgpu_device *adev = con->adev; 3458 int i; 3459 3460 if ((addr >= adev->gmc.mc_vram_size && 3461 adev->gmc.mc_vram_size) || 3462 (addr >= RAS_UMC_INJECT_ADDR_LIMIT)) 3463 return -EINVAL; 3464 3465 addr >>= AMDGPU_GPU_PAGE_SHIFT; 3466 for (i = 0; i < data->count; i++) 3467 if (addr == data->bps[i].retired_page) 3468 return 1; 3469 3470 return 0; 3471 } 3472 3473 /* 3474 * check if an address belongs to bad page 3475 * 3476 * Note: this check is only for umc block 3477 */ 3478 static int amdgpu_ras_check_bad_page(struct amdgpu_device *adev, 3479 uint64_t addr) 3480 { 3481 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 3482 int ret = 0; 3483 3484 if (!con || !con->eh_data) 3485 return ret; 3486 3487 mutex_lock(&con->recovery_lock); 3488 ret = 
amdgpu_ras_check_bad_page_unlock(con, addr); 3489 mutex_unlock(&con->recovery_lock); 3490 return ret; 3491 } 3492 3493 static void amdgpu_ras_validate_threshold(struct amdgpu_device *adev, 3494 uint32_t max_count) 3495 { 3496 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 3497 3498 /* 3499 * amdgpu_bad_page_threshold is used to config 3500 * the threshold for the number of bad pages. 3501 * -1: Threshold is set to default value 3502 * Driver will issue a warning message when threshold is reached 3503 * and continue runtime services. 3504 * 0: Disable bad page retirement 3505 * Driver will not retire bad pages 3506 * which is intended for debugging purpose. 3507 * -2: Threshold is determined by a formula 3508 * that assumes 1 bad page per 100M of local memory. 3509 * Driver will continue runtime services when threhold is reached. 3510 * 0 < threshold < max number of bad page records in EEPROM, 3511 * A user-defined threshold is set 3512 * Driver will halt runtime services when this custom threshold is reached. 
3513 */ 3514 if (amdgpu_bad_page_threshold == -2) { 3515 u64 val = adev->gmc.mc_vram_size; 3516 3517 do_div(val, RAS_BAD_PAGE_COVER); 3518 con->bad_page_cnt_threshold = min(lower_32_bits(val), 3519 max_count); 3520 } else if (amdgpu_bad_page_threshold == -1) { 3521 con->bad_page_cnt_threshold = ((con->reserved_pages_in_bytes) >> 21) << 4; 3522 } else { 3523 con->bad_page_cnt_threshold = min_t(int, max_count, 3524 amdgpu_bad_page_threshold); 3525 } 3526 } 3527 3528 int amdgpu_ras_put_poison_req(struct amdgpu_device *adev, 3529 enum amdgpu_ras_block block, uint16_t pasid, 3530 pasid_notify pasid_fn, void *data, uint32_t reset) 3531 { 3532 int ret = 0; 3533 struct ras_poison_msg poison_msg; 3534 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 3535 3536 memset(&poison_msg, 0, sizeof(poison_msg)); 3537 poison_msg.block = block; 3538 poison_msg.pasid = pasid; 3539 poison_msg.reset = reset; 3540 poison_msg.pasid_fn = pasid_fn; 3541 poison_msg.data = data; 3542 3543 ret = kfifo_put(&con->poison_fifo, poison_msg); 3544 if (!ret) { 3545 dev_err(adev->dev, "Poison message fifo is full!\n"); 3546 return -ENOSPC; 3547 } 3548 3549 return 0; 3550 } 3551 3552 static int amdgpu_ras_get_poison_req(struct amdgpu_device *adev, 3553 struct ras_poison_msg *poison_msg) 3554 { 3555 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 3556 3557 return kfifo_get(&con->poison_fifo, poison_msg); 3558 } 3559 3560 static void amdgpu_ras_ecc_log_init(struct ras_ecc_log_info *ecc_log) 3561 { 3562 mutex_init(&ecc_log->lock); 3563 3564 INIT_RADIX_TREE(&ecc_log->de_page_tree, GFP_KERNEL); 3565 ecc_log->de_queried_count = 0; 3566 ecc_log->consumption_q_count = 0; 3567 } 3568 3569 static void amdgpu_ras_ecc_log_fini(struct ras_ecc_log_info *ecc_log) 3570 { 3571 struct radix_tree_iter iter; 3572 void __rcu **slot; 3573 struct ras_ecc_err *ecc_err; 3574 3575 mutex_lock(&ecc_log->lock); 3576 radix_tree_for_each_slot(slot, &ecc_log->de_page_tree, &iter, 0) { 3577 ecc_err = 
radix_tree_deref_slot(slot); 3578 kfree(ecc_err->err_pages.pfn); 3579 kfree(ecc_err); 3580 radix_tree_iter_delete(&ecc_log->de_page_tree, &iter, slot); 3581 } 3582 mutex_unlock(&ecc_log->lock); 3583 3584 mutex_destroy(&ecc_log->lock); 3585 ecc_log->de_queried_count = 0; 3586 ecc_log->consumption_q_count = 0; 3587 } 3588 3589 static bool amdgpu_ras_schedule_retirement_dwork(struct amdgpu_ras *con, 3590 uint32_t delayed_ms) 3591 { 3592 int ret; 3593 3594 mutex_lock(&con->umc_ecc_log.lock); 3595 ret = radix_tree_tagged(&con->umc_ecc_log.de_page_tree, 3596 UMC_ECC_NEW_DETECTED_TAG); 3597 mutex_unlock(&con->umc_ecc_log.lock); 3598 3599 if (ret) 3600 schedule_delayed_work(&con->page_retirement_dwork, 3601 msecs_to_jiffies(delayed_ms)); 3602 3603 return ret ? true : false; 3604 } 3605 3606 static void amdgpu_ras_do_page_retirement(struct work_struct *work) 3607 { 3608 struct amdgpu_ras *con = container_of(work, struct amdgpu_ras, 3609 page_retirement_dwork.work); 3610 struct amdgpu_device *adev = con->adev; 3611 struct ras_err_data err_data; 3612 3613 /* If gpu reset is ongoing, delay retiring the bad pages */ 3614 if (amdgpu_in_reset(adev) || amdgpu_ras_in_recovery(adev)) { 3615 amdgpu_ras_schedule_retirement_dwork(con, 3616 AMDGPU_RAS_RETIRE_PAGE_INTERVAL * 3); 3617 return; 3618 } 3619 3620 amdgpu_ras_error_data_init(&err_data); 3621 3622 amdgpu_umc_handle_bad_pages(adev, &err_data); 3623 3624 amdgpu_ras_error_data_fini(&err_data); 3625 3626 amdgpu_ras_schedule_retirement_dwork(con, 3627 AMDGPU_RAS_RETIRE_PAGE_INTERVAL); 3628 } 3629 3630 static int amdgpu_ras_poison_creation_handler(struct amdgpu_device *adev, 3631 uint32_t poison_creation_count) 3632 { 3633 int ret = 0; 3634 struct ras_ecc_log_info *ecc_log; 3635 struct ras_query_if info; 3636 u32 timeout = MAX_UMC_POISON_POLLING_TIME_ASYNC; 3637 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 3638 u64 de_queried_count; 3639 u64 consumption_q_count; 3640 enum ras_event_type type = RAS_EVENT_TYPE_POISON_CREATION; 
3641 3642 memset(&info, 0, sizeof(info)); 3643 info.head.block = AMDGPU_RAS_BLOCK__UMC; 3644 3645 ecc_log = &ras->umc_ecc_log; 3646 ecc_log->de_queried_count = 0; 3647 ecc_log->consumption_q_count = 0; 3648 3649 do { 3650 ret = amdgpu_ras_query_error_status_with_event(adev, &info, type); 3651 if (ret) 3652 return ret; 3653 3654 de_queried_count = ecc_log->de_queried_count; 3655 consumption_q_count = ecc_log->consumption_q_count; 3656 3657 if (de_queried_count && consumption_q_count) 3658 break; 3659 3660 msleep(100); 3661 } while (--timeout); 3662 3663 if (de_queried_count) 3664 schedule_delayed_work(&ras->page_retirement_dwork, 0); 3665 3666 if (amdgpu_ras_is_rma(adev) && atomic_cmpxchg(&ras->rma_in_recovery, 0, 1) == 0) 3667 amdgpu_ras_reset_gpu(adev); 3668 3669 return 0; 3670 } 3671 3672 static void amdgpu_ras_clear_poison_fifo(struct amdgpu_device *adev) 3673 { 3674 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 3675 struct ras_poison_msg msg; 3676 int ret; 3677 3678 do { 3679 ret = kfifo_get(&con->poison_fifo, &msg); 3680 } while (ret); 3681 } 3682 3683 static int amdgpu_ras_poison_consumption_handler(struct amdgpu_device *adev, 3684 uint32_t msg_count, uint32_t *gpu_reset) 3685 { 3686 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 3687 uint32_t reset_flags = 0, reset = 0; 3688 struct ras_poison_msg msg; 3689 int ret, i; 3690 3691 kgd2kfd_set_sram_ecc_flag(adev->kfd.dev); 3692 3693 for (i = 0; i < msg_count; i++) { 3694 ret = amdgpu_ras_get_poison_req(adev, &msg); 3695 if (!ret) 3696 continue; 3697 3698 if (msg.pasid_fn) 3699 msg.pasid_fn(adev, msg.pasid, msg.data); 3700 3701 reset_flags |= msg.reset; 3702 } 3703 3704 /* 3705 * Try to ensure poison creation handler is completed first 3706 * to set rma if bad page exceed threshold. 
3707 */ 3708 flush_delayed_work(&con->page_retirement_dwork); 3709 3710 /* for RMA, amdgpu_ras_poison_creation_handler will trigger gpu reset */ 3711 if (reset_flags && !amdgpu_ras_is_rma(adev)) { 3712 if (reset_flags & AMDGPU_RAS_GPU_RESET_MODE1_RESET) 3713 reset = AMDGPU_RAS_GPU_RESET_MODE1_RESET; 3714 else if (reset_flags & AMDGPU_RAS_GPU_RESET_MODE2_RESET) 3715 reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET; 3716 else 3717 reset = reset_flags; 3718 3719 con->gpu_reset_flags |= reset; 3720 amdgpu_ras_reset_gpu(adev); 3721 3722 *gpu_reset = reset; 3723 3724 /* Wait for gpu recovery to complete */ 3725 flush_work(&con->recovery_work); 3726 } 3727 3728 return 0; 3729 } 3730 3731 static int amdgpu_ras_page_retirement_thread(void *param) 3732 { 3733 struct amdgpu_device *adev = (struct amdgpu_device *)param; 3734 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 3735 uint32_t poison_creation_count, msg_count; 3736 uint32_t gpu_reset; 3737 int ret; 3738 3739 while (!kthread_should_stop()) { 3740 3741 wait_event_interruptible(con->page_retirement_wq, 3742 kthread_should_stop() || 3743 atomic_read(&con->page_retirement_req_cnt)); 3744 3745 if (kthread_should_stop()) 3746 break; 3747 3748 mutex_lock(&con->poison_lock); 3749 gpu_reset = 0; 3750 3751 do { 3752 poison_creation_count = atomic_read(&con->poison_creation_count); 3753 ret = amdgpu_ras_poison_creation_handler(adev, poison_creation_count); 3754 if (ret == -EIO) 3755 break; 3756 3757 if (poison_creation_count) { 3758 atomic_sub(poison_creation_count, &con->poison_creation_count); 3759 atomic_sub(poison_creation_count, &con->page_retirement_req_cnt); 3760 } 3761 } while (atomic_read(&con->poison_creation_count) && 3762 !atomic_read(&con->poison_consumption_count)); 3763 3764 if (ret != -EIO) { 3765 msg_count = kfifo_len(&con->poison_fifo); 3766 if (msg_count) { 3767 ret = amdgpu_ras_poison_consumption_handler(adev, 3768 msg_count, &gpu_reset); 3769 if ((ret != -EIO) && 3770 (gpu_reset != 
AMDGPU_RAS_GPU_RESET_MODE1_RESET)) 3771 atomic_sub(msg_count, &con->page_retirement_req_cnt); 3772 } 3773 } 3774 3775 if ((ret == -EIO) || (gpu_reset == AMDGPU_RAS_GPU_RESET_MODE1_RESET)) { 3776 /* gpu mode-1 reset is ongoing or just completed ras mode-1 reset */ 3777 /* Clear poison creation request */ 3778 atomic_set(&con->poison_creation_count, 0); 3779 atomic_set(&con->poison_consumption_count, 0); 3780 3781 /* Clear poison fifo */ 3782 amdgpu_ras_clear_poison_fifo(adev); 3783 3784 /* Clear all poison requests */ 3785 atomic_set(&con->page_retirement_req_cnt, 0); 3786 3787 if (ret == -EIO) { 3788 /* Wait for mode-1 reset to complete */ 3789 down_read(&adev->reset_domain->sem); 3790 up_read(&adev->reset_domain->sem); 3791 } 3792 3793 /* Wake up work to save bad pages to eeprom */ 3794 schedule_delayed_work(&con->page_retirement_dwork, 0); 3795 } else if (gpu_reset) { 3796 /* gpu just completed mode-2 reset or other reset */ 3797 /* Clear poison consumption messages cached in fifo */ 3798 msg_count = kfifo_len(&con->poison_fifo); 3799 if (msg_count) { 3800 amdgpu_ras_clear_poison_fifo(adev); 3801 atomic_sub(msg_count, &con->page_retirement_req_cnt); 3802 } 3803 3804 atomic_set(&con->poison_consumption_count, 0); 3805 3806 /* Wake up work to save bad pages to eeprom */ 3807 schedule_delayed_work(&con->page_retirement_dwork, 0); 3808 } 3809 mutex_unlock(&con->poison_lock); 3810 } 3811 3812 return 0; 3813 } 3814 3815 int amdgpu_ras_init_badpage_info(struct amdgpu_device *adev) 3816 { 3817 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 3818 struct amdgpu_ras_eeprom_control *control; 3819 int ret; 3820 3821 if (!con || amdgpu_sriov_vf(adev)) 3822 return 0; 3823 3824 if (amdgpu_uniras_enabled(adev)) 3825 return 0; 3826 3827 control = &con->eeprom_control; 3828 con->ras_smu_drv = amdgpu_dpm_get_ras_smu_driver(adev); 3829 3830 ret = amdgpu_ras_eeprom_init(control); 3831 control->is_eeprom_valid = !ret; 3832 3833 if (!adev->umc.ras || 
!adev->umc.ras->convert_ras_err_addr) 3834 control->ras_num_pa_recs = control->ras_num_recs; 3835 3836 if (adev->umc.ras && 3837 adev->umc.ras->get_retire_flip_bits) 3838 adev->umc.ras->get_retire_flip_bits(adev); 3839 3840 if (control->ras_num_recs && control->is_eeprom_valid) { 3841 ret = amdgpu_ras_load_bad_pages(adev); 3842 if (ret) { 3843 control->is_eeprom_valid = false; 3844 return 0; 3845 } 3846 3847 amdgpu_dpm_send_hbm_bad_pages_num( 3848 adev, control->ras_num_bad_pages); 3849 3850 if (con->update_channel_flag == true) { 3851 amdgpu_dpm_send_hbm_bad_channel_flag( 3852 adev, control->bad_channel_bitmap); 3853 con->update_channel_flag = false; 3854 } 3855 3856 /* The format action is only applied to new ASICs */ 3857 if (IP_VERSION_MAJ(amdgpu_ip_version(adev, UMC_HWIP, 0)) >= 12 && 3858 control->tbl_hdr.version < RAS_TABLE_VER_V3) 3859 if (!amdgpu_ras_eeprom_reset_table(control)) 3860 if (amdgpu_ras_save_bad_pages(adev, NULL)) 3861 dev_warn(adev->dev, "Failed to format RAS EEPROM data in V3 version!\n"); 3862 } 3863 3864 return 0; 3865 } 3866 3867 int amdgpu_ras_recovery_init(struct amdgpu_device *adev, bool init_bp_info) 3868 { 3869 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 3870 struct ras_err_handler_data **data; 3871 u32 max_eeprom_records_count = 0; 3872 int ret; 3873 3874 if (!con || amdgpu_sriov_vf(adev)) 3875 return 0; 3876 3877 /* Allow access to RAS EEPROM via debugfs, when the ASIC 3878 * supports RAS and debugfs is enabled, but when 3879 * adev->ras_enabled is unset, i.e. when "ras_enable" 3880 * module parameter is set to 0. 
3881 */ 3882 con->adev = adev; 3883 3884 if (!adev->ras_enabled) 3885 return 0; 3886 3887 data = &con->eh_data; 3888 *data = kzalloc_obj(**data); 3889 if (!*data) { 3890 ret = -ENOMEM; 3891 goto out; 3892 } 3893 3894 mutex_init(&con->recovery_lock); 3895 mutex_init(&con->poison_lock); 3896 INIT_WORK(&con->recovery_work, amdgpu_ras_do_recovery); 3897 atomic_set(&con->in_recovery, 0); 3898 atomic_set(&con->rma_in_recovery, 0); 3899 con->eeprom_control.bad_channel_bitmap = 0; 3900 3901 max_eeprom_records_count = amdgpu_ras_eeprom_max_record_count(&con->eeprom_control); 3902 amdgpu_ras_validate_threshold(adev, max_eeprom_records_count); 3903 3904 if (init_bp_info) { 3905 ret = amdgpu_ras_init_badpage_info(adev); 3906 if (ret) 3907 goto free; 3908 } 3909 3910 mutex_init(&con->page_rsv_lock); 3911 INIT_KFIFO(con->poison_fifo); 3912 mutex_init(&con->page_retirement_lock); 3913 init_waitqueue_head(&con->page_retirement_wq); 3914 atomic_set(&con->page_retirement_req_cnt, 0); 3915 atomic_set(&con->poison_creation_count, 0); 3916 atomic_set(&con->poison_consumption_count, 0); 3917 con->page_retirement_thread = 3918 kthread_run(amdgpu_ras_page_retirement_thread, adev, "umc_page_retirement"); 3919 if (IS_ERR(con->page_retirement_thread)) { 3920 con->page_retirement_thread = NULL; 3921 dev_warn(adev->dev, "Failed to create umc_page_retirement thread!!!\n"); 3922 } 3923 3924 INIT_DELAYED_WORK(&con->page_retirement_dwork, amdgpu_ras_do_page_retirement); 3925 amdgpu_ras_ecc_log_init(&con->umc_ecc_log); 3926 #ifdef CONFIG_X86_MCE_AMD 3927 if ((adev->asic_type == CHIP_ALDEBARAN) && 3928 (adev->gmc.xgmi.connected_to_cpu)) 3929 amdgpu_register_bad_pages_mca_notifier(adev); 3930 #endif 3931 return 0; 3932 3933 free: 3934 kfree((*data)->bps); 3935 kfree(*data); 3936 con->eh_data = NULL; 3937 out: 3938 dev_warn(adev->dev, "Failed to initialize ras recovery! 
(%d)\n", ret); 3939 3940 /* 3941 * Except error threshold exceeding case, other failure cases in this 3942 * function would not fail amdgpu driver init. 3943 */ 3944 if (!amdgpu_ras_is_rma(adev)) 3945 ret = 0; 3946 else 3947 ret = -EINVAL; 3948 3949 return ret; 3950 } 3951 3952 static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev) 3953 { 3954 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 3955 struct ras_err_handler_data *data = con->eh_data; 3956 int max_flush_timeout = MAX_FLUSH_RETIRE_DWORK_TIMES; 3957 bool ret; 3958 3959 /* recovery_init failed to init it, fini is useless */ 3960 if (!data) 3961 return 0; 3962 3963 /* Save all cached bad pages to eeprom */ 3964 do { 3965 flush_delayed_work(&con->page_retirement_dwork); 3966 ret = amdgpu_ras_schedule_retirement_dwork(con, 0); 3967 } while (ret && max_flush_timeout--); 3968 3969 if (con->page_retirement_thread) 3970 kthread_stop(con->page_retirement_thread); 3971 3972 atomic_set(&con->page_retirement_req_cnt, 0); 3973 atomic_set(&con->poison_creation_count, 0); 3974 3975 mutex_destroy(&con->page_rsv_lock); 3976 3977 cancel_work_sync(&con->recovery_work); 3978 3979 cancel_delayed_work_sync(&con->page_retirement_dwork); 3980 3981 amdgpu_ras_ecc_log_fini(&con->umc_ecc_log); 3982 3983 mutex_lock(&con->recovery_lock); 3984 con->eh_data = NULL; 3985 kfree(data->bps); 3986 kfree(data); 3987 mutex_unlock(&con->recovery_lock); 3988 3989 amdgpu_ras_critical_region_init(adev); 3990 #ifdef CONFIG_X86_MCE_AMD 3991 amdgpu_unregister_bad_pages_mca_notifier(adev); 3992 #endif 3993 return 0; 3994 } 3995 /* recovery end */ 3996 3997 static bool amdgpu_ras_asic_supported(struct amdgpu_device *adev) 3998 { 3999 if (amdgpu_sriov_vf(adev)) { 4000 switch (amdgpu_ip_version(adev, MP0_HWIP, 0)) { 4001 case IP_VERSION(13, 0, 2): 4002 case IP_VERSION(13, 0, 6): 4003 case IP_VERSION(13, 0, 12): 4004 case IP_VERSION(13, 0, 14): 4005 case IP_VERSION(13, 0, 15): 4006 return true; 4007 default: 4008 return false; 4009 } 4010 } 
4011 4012 if (adev->asic_type == CHIP_IP_DISCOVERY) { 4013 switch (amdgpu_ip_version(adev, MP0_HWIP, 0)) { 4014 case IP_VERSION(13, 0, 0): 4015 case IP_VERSION(13, 0, 6): 4016 case IP_VERSION(13, 0, 10): 4017 case IP_VERSION(13, 0, 12): 4018 case IP_VERSION(13, 0, 14): 4019 case IP_VERSION(13, 0, 15): 4020 case IP_VERSION(14, 0, 3): 4021 return true; 4022 default: 4023 return false; 4024 } 4025 } 4026 4027 return adev->asic_type == CHIP_VEGA10 || 4028 adev->asic_type == CHIP_VEGA20 || 4029 adev->asic_type == CHIP_ARCTURUS || 4030 adev->asic_type == CHIP_ALDEBARAN || 4031 adev->asic_type == CHIP_SIENNA_CICHLID; 4032 } 4033 4034 /* 4035 * this is workaround for vega20 workstation sku, 4036 * force enable gfx ras, ignore vbios gfx ras flag 4037 * due to GC EDC can not write 4038 */ 4039 static void amdgpu_ras_get_quirks(struct amdgpu_device *adev) 4040 { 4041 struct atom_context *ctx = adev->mode_info.atom_context; 4042 4043 if (!ctx) 4044 return; 4045 4046 if (strnstr(ctx->vbios_pn, "D16406", 4047 sizeof(ctx->vbios_pn)) || 4048 strnstr(ctx->vbios_pn, "D36002", 4049 sizeof(ctx->vbios_pn))) 4050 adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__GFX); 4051 } 4052 4053 /* Query ras capablity via atomfirmware interface */ 4054 static void amdgpu_ras_query_ras_capablity_from_vbios(struct amdgpu_device *adev) 4055 { 4056 /* mem_ecc cap */ 4057 if (amdgpu_atomfirmware_mem_ecc_supported(adev)) { 4058 dev_info(adev->dev, "MEM ECC is active.\n"); 4059 adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__UMC | 4060 1 << AMDGPU_RAS_BLOCK__DF); 4061 } else { 4062 dev_info(adev->dev, "MEM ECC is not presented.\n"); 4063 } 4064 4065 /* sram_ecc cap */ 4066 if (amdgpu_atomfirmware_sram_ecc_supported(adev)) { 4067 dev_info(adev->dev, "SRAM ECC is active.\n"); 4068 if (!amdgpu_sriov_vf(adev)) 4069 adev->ras_hw_enabled |= ~(1 << AMDGPU_RAS_BLOCK__UMC | 4070 1 << AMDGPU_RAS_BLOCK__DF); 4071 else 4072 adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__PCIE_BIF | 4073 1 << AMDGPU_RAS_BLOCK__SDMA | 
4074 1 << AMDGPU_RAS_BLOCK__GFX); 4075 4076 /* 4077 * VCN/JPEG RAS can be supported on both bare metal and 4078 * SRIOV environment 4079 */ 4080 if (amdgpu_ip_version(adev, VCN_HWIP, 0) == IP_VERSION(2, 6, 0) || 4081 amdgpu_ip_version(adev, VCN_HWIP, 0) == IP_VERSION(4, 0, 0) || 4082 amdgpu_ip_version(adev, VCN_HWIP, 0) == IP_VERSION(4, 0, 3) || 4083 amdgpu_ip_version(adev, VCN_HWIP, 0) == IP_VERSION(5, 0, 1)) 4084 adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__VCN | 4085 1 << AMDGPU_RAS_BLOCK__JPEG); 4086 else 4087 adev->ras_hw_enabled &= ~(1 << AMDGPU_RAS_BLOCK__VCN | 4088 1 << AMDGPU_RAS_BLOCK__JPEG); 4089 4090 /* 4091 * XGMI RAS is not supported if xgmi num physical nodes 4092 * is zero 4093 */ 4094 if (!adev->gmc.xgmi.num_physical_nodes) 4095 adev->ras_hw_enabled &= ~(1 << AMDGPU_RAS_BLOCK__XGMI_WAFL); 4096 } else { 4097 dev_info(adev->dev, "SRAM ECC is not presented.\n"); 4098 } 4099 } 4100 4101 /* Query poison mode from umc/df IP callbacks */ 4102 static void amdgpu_ras_query_poison_mode(struct amdgpu_device *adev) 4103 { 4104 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 4105 bool df_poison, umc_poison; 4106 4107 /* poison setting is useless on SRIOV guest */ 4108 if (amdgpu_sriov_vf(adev) || !con) 4109 return; 4110 4111 /* Init poison supported flag, the default value is false */ 4112 if (adev->gmc.xgmi.connected_to_cpu || 4113 adev->gmc.is_app_apu) { 4114 /* enabled by default when GPU is connected to CPU */ 4115 con->poison_supported = true; 4116 } else if (adev->df.funcs && 4117 adev->df.funcs->query_ras_poison_mode && 4118 adev->umc.ras && 4119 adev->umc.ras->query_ras_poison_mode) { 4120 df_poison = 4121 adev->df.funcs->query_ras_poison_mode(adev); 4122 umc_poison = 4123 adev->umc.ras->query_ras_poison_mode(adev); 4124 4125 /* Only poison is set in both DF and UMC, we can support it */ 4126 if (df_poison && umc_poison) 4127 con->poison_supported = true; 4128 else if (df_poison != umc_poison) 4129 dev_warn(adev->dev, 4130 "Poison setting is 
inconsistent in DF/UMC(%d:%d)!\n", 4131 df_poison, umc_poison); 4132 } 4133 } 4134 4135 /* 4136 * check hardware's ras ability which will be saved in hw_supported. 4137 * if hardware does not support ras, we can skip some ras initializtion and 4138 * forbid some ras operations from IP. 4139 * if software itself, say boot parameter, limit the ras ability. We still 4140 * need allow IP do some limited operations, like disable. In such case, 4141 * we have to initialize ras as normal. but need check if operation is 4142 * allowed or not in each function. 4143 */ 4144 static void amdgpu_ras_check_supported(struct amdgpu_device *adev) 4145 { 4146 adev->ras_hw_enabled = adev->ras_enabled = 0; 4147 4148 if (!amdgpu_ras_asic_supported(adev)) 4149 return; 4150 4151 if (amdgpu_sriov_vf(adev)) { 4152 if (amdgpu_virt_get_ras_capability(adev)) 4153 goto init_ras_enabled_flag; 4154 } 4155 4156 /* query ras capability from psp */ 4157 if (amdgpu_psp_get_ras_capability(&adev->psp)) 4158 goto init_ras_enabled_flag; 4159 4160 /* query ras capablity from bios */ 4161 if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) { 4162 amdgpu_ras_query_ras_capablity_from_vbios(adev); 4163 } else { 4164 /* driver only manages a few IP blocks RAS feature 4165 * when GPU is connected cpu through XGMI */ 4166 adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__GFX | 4167 1 << AMDGPU_RAS_BLOCK__SDMA | 4168 1 << AMDGPU_RAS_BLOCK__MMHUB); 4169 } 4170 4171 /* apply asic specific settings (vega20 only for now) */ 4172 amdgpu_ras_get_quirks(adev); 4173 4174 /* query poison mode from umc/df ip callback */ 4175 amdgpu_ras_query_poison_mode(adev); 4176 4177 init_ras_enabled_flag: 4178 /* hw_supported needs to be aligned with RAS block mask. */ 4179 adev->ras_hw_enabled &= AMDGPU_RAS_BLOCK_MASK; 4180 4181 adev->ras_enabled = amdgpu_ras_enable == 0 ? 
0 : 4182 adev->ras_hw_enabled & amdgpu_ras_mask; 4183 4184 /* aca is disabled by default except for psp v13_0_6/v13_0_12/v13_0_14 */ 4185 if (!amdgpu_sriov_vf(adev)) { 4186 adev->aca.is_enabled = 4187 (amdgpu_ip_version(adev, MP0_HWIP, 0) == IP_VERSION(13, 0, 6) || 4188 amdgpu_ip_version(adev, MP0_HWIP, 0) == IP_VERSION(13, 0, 12) || 4189 amdgpu_ip_version(adev, MP0_HWIP, 0) == IP_VERSION(13, 0, 14) || 4190 amdgpu_ip_version(adev, MP0_HWIP, 0) == IP_VERSION(13, 0, 15)); 4191 } 4192 4193 /* bad page feature is not applicable to specific app platform */ 4194 if (adev->gmc.is_app_apu && 4195 amdgpu_ip_version(adev, UMC_HWIP, 0) == IP_VERSION(12, 0, 0)) 4196 amdgpu_bad_page_threshold = 0; 4197 } 4198 4199 static void amdgpu_ras_counte_dw(struct work_struct *work) 4200 { 4201 struct amdgpu_ras *con = container_of(work, struct amdgpu_ras, 4202 ras_counte_delay_work.work); 4203 struct amdgpu_device *adev = con->adev; 4204 struct drm_device *dev = adev_to_drm(adev); 4205 unsigned long ce_count, ue_count; 4206 int res; 4207 4208 res = pm_runtime_get_sync(dev->dev); 4209 if (res < 0) 4210 goto Out; 4211 4212 /* Cache new values. 4213 */ 4214 if (amdgpu_ras_query_error_count(adev, &ce_count, &ue_count, NULL) == 0) { 4215 atomic_set(&con->ras_ce_count, ce_count); 4216 atomic_set(&con->ras_ue_count, ue_count); 4217 } 4218 4219 Out: 4220 pm_runtime_put_autosuspend(dev->dev); 4221 } 4222 4223 static int amdgpu_get_ras_schema(struct amdgpu_device *adev) 4224 { 4225 return amdgpu_ras_is_poison_mode_supported(adev) ? 
AMDGPU_RAS_ERROR__POISON : 0 | 4226 AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE | 4227 AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE | 4228 AMDGPU_RAS_ERROR__PARITY; 4229 } 4230 4231 static void ras_event_mgr_init(struct ras_event_manager *mgr) 4232 { 4233 struct ras_event_state *event_state; 4234 int i; 4235 4236 memset(mgr, 0, sizeof(*mgr)); 4237 atomic64_set(&mgr->seqno, 0); 4238 4239 for (i = 0; i < ARRAY_SIZE(mgr->event_state); i++) { 4240 event_state = &mgr->event_state[i]; 4241 event_state->last_seqno = RAS_EVENT_INVALID_ID; 4242 atomic64_set(&event_state->count, 0); 4243 } 4244 } 4245 4246 static void amdgpu_ras_event_mgr_init(struct amdgpu_device *adev) 4247 { 4248 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 4249 struct amdgpu_hive_info *hive; 4250 4251 if (!ras) 4252 return; 4253 4254 hive = amdgpu_get_xgmi_hive(adev); 4255 ras->event_mgr = hive ? &hive->event_mgr : &ras->__event_mgr; 4256 4257 /* init event manager with node 0 on xgmi system */ 4258 if (!amdgpu_reset_in_recovery(adev)) { 4259 if (!hive || adev->gmc.xgmi.node_id == 0) 4260 ras_event_mgr_init(ras->event_mgr); 4261 } 4262 4263 if (hive) 4264 amdgpu_put_xgmi_hive(hive); 4265 } 4266 4267 static void amdgpu_ras_init_reserved_vram_size(struct amdgpu_device *adev) 4268 { 4269 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 4270 4271 if (!con || (adev->flags & AMD_IS_APU)) 4272 return; 4273 4274 switch (amdgpu_ip_version(adev, MP0_HWIP, 0)) { 4275 case IP_VERSION(13, 0, 2): 4276 case IP_VERSION(13, 0, 6): 4277 case IP_VERSION(13, 0, 12): 4278 case IP_VERSION(13, 0, 15): 4279 con->reserved_pages_in_bytes = AMDGPU_RAS_RESERVED_VRAM_SIZE_DEFAULT; 4280 break; 4281 case IP_VERSION(13, 0, 14): 4282 con->reserved_pages_in_bytes = (AMDGPU_RAS_RESERVED_VRAM_SIZE_DEFAULT << 1); 4283 break; 4284 default: 4285 break; 4286 } 4287 } 4288 4289 int amdgpu_ras_init(struct amdgpu_device *adev) 4290 { 4291 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 4292 int r; 4293 4294 if (con) 4295 return 0; 4296 
4297 con = kzalloc(sizeof(*con) + 4298 sizeof(struct ras_manager) * AMDGPU_RAS_BLOCK_COUNT + 4299 sizeof(struct ras_manager) * AMDGPU_RAS_MCA_BLOCK_COUNT, 4300 GFP_KERNEL); 4301 if (!con) 4302 return -ENOMEM; 4303 4304 con->adev = adev; 4305 INIT_DELAYED_WORK(&con->ras_counte_delay_work, amdgpu_ras_counte_dw); 4306 atomic_set(&con->ras_ce_count, 0); 4307 atomic_set(&con->ras_ue_count, 0); 4308 4309 con->objs = (struct ras_manager *)(con + 1); 4310 4311 amdgpu_ras_set_context(adev, con); 4312 4313 amdgpu_ras_check_supported(adev); 4314 4315 if (!adev->ras_enabled || adev->asic_type == CHIP_VEGA10) { 4316 /* set gfx block ras context feature for VEGA20 Gaming 4317 * send ras disable cmd to ras ta during ras late init. 4318 */ 4319 if (!adev->ras_enabled && adev->asic_type == CHIP_VEGA20) { 4320 con->features |= BIT(AMDGPU_RAS_BLOCK__GFX); 4321 4322 return 0; 4323 } 4324 4325 r = 0; 4326 goto release_con; 4327 } 4328 4329 con->update_channel_flag = false; 4330 con->features = 0; 4331 con->schema = 0; 4332 INIT_LIST_HEAD(&con->head); 4333 /* Might need get this flag from vbios. */ 4334 con->flags = RAS_DEFAULT_FLAGS; 4335 4336 /* initialize nbio ras function ahead of any other 4337 * ras functions so hardware fatal error interrupt 4338 * can be enabled as early as possible */ 4339 switch (amdgpu_ip_version(adev, NBIO_HWIP, 0)) { 4340 case IP_VERSION(7, 4, 0): 4341 case IP_VERSION(7, 4, 1): 4342 case IP_VERSION(7, 4, 4): 4343 if (!adev->gmc.xgmi.connected_to_cpu) 4344 adev->nbio.ras = &nbio_v7_4_ras; 4345 break; 4346 case IP_VERSION(4, 3, 0): 4347 if (adev->ras_hw_enabled & (1 << AMDGPU_RAS_BLOCK__DF)) 4348 /* unlike other generation of nbio ras, 4349 * nbio v4_3 only support fatal error interrupt 4350 * to inform software that DF is freezed due to 4351 * system fatal error event. driver should not 4352 * enable nbio ras in such case. 
Instead, 4353 * check DF RAS */ 4354 adev->nbio.ras = &nbio_v4_3_ras; 4355 break; 4356 case IP_VERSION(6, 3, 1): 4357 if (adev->ras_hw_enabled & (1 << AMDGPU_RAS_BLOCK__DF)) 4358 /* unlike other generation of nbio ras, 4359 * nbif v6_3_1 only support fatal error interrupt 4360 * to inform software that DF is freezed due to 4361 * system fatal error event. driver should not 4362 * enable nbio ras in such case. Instead, 4363 * check DF RAS 4364 */ 4365 adev->nbio.ras = &nbif_v6_3_1_ras; 4366 break; 4367 case IP_VERSION(7, 9, 0): 4368 case IP_VERSION(7, 9, 1): 4369 if (!adev->gmc.is_app_apu) 4370 adev->nbio.ras = &nbio_v7_9_ras; 4371 break; 4372 default: 4373 /* nbio ras is not available */ 4374 break; 4375 } 4376 4377 /* nbio ras block needs to be enabled ahead of other ras blocks 4378 * to handle fatal error */ 4379 r = amdgpu_nbio_ras_sw_init(adev); 4380 if (r) 4381 goto release_con; 4382 4383 if (adev->nbio.ras && 4384 adev->nbio.ras->init_ras_controller_interrupt) { 4385 r = adev->nbio.ras->init_ras_controller_interrupt(adev); 4386 if (r) 4387 goto release_con; 4388 } 4389 4390 if (adev->nbio.ras && 4391 adev->nbio.ras->init_ras_err_event_athub_interrupt) { 4392 r = adev->nbio.ras->init_ras_err_event_athub_interrupt(adev); 4393 if (r) 4394 goto release_con; 4395 } 4396 4397 /* Packed socket_id to ras feature mask bits[31:29] */ 4398 if (adev->smuio.funcs && 4399 adev->smuio.funcs->get_socket_id) 4400 con->features |= ((adev->smuio.funcs->get_socket_id(adev)) << 4401 AMDGPU_RAS_FEATURES_SOCKETID_SHIFT); 4402 4403 /* Get RAS schema for particular SOC */ 4404 con->schema = amdgpu_get_ras_schema(adev); 4405 4406 amdgpu_ras_init_reserved_vram_size(adev); 4407 4408 if (amdgpu_ras_fs_init(adev)) { 4409 r = -EINVAL; 4410 goto release_con; 4411 } 4412 4413 if (amdgpu_ras_aca_is_supported(adev)) { 4414 if (amdgpu_aca_is_enabled(adev)) 4415 r = amdgpu_aca_init(adev); 4416 else 4417 r = amdgpu_mca_init(adev); 4418 if (r) 4419 goto release_con; 4420 } 4421 4422 
con->init_task_pid = task_pid_nr(current); 4423 get_task_comm(con->init_task_comm, current); 4424 4425 mutex_init(&con->critical_region_lock); 4426 INIT_LIST_HEAD(&con->critical_region_head); 4427 4428 dev_info(adev->dev, "RAS INFO: ras initialized successfully, " 4429 "hardware ability[%x] ras_mask[%x]\n", 4430 adev->ras_hw_enabled, adev->ras_enabled); 4431 4432 return 0; 4433 release_con: 4434 amdgpu_ras_set_context(adev, NULL); 4435 kfree(con); 4436 4437 return r; 4438 } 4439 4440 int amdgpu_persistent_edc_harvesting_supported(struct amdgpu_device *adev) 4441 { 4442 if (adev->gmc.xgmi.connected_to_cpu || 4443 adev->gmc.is_app_apu) 4444 return 1; 4445 return 0; 4446 } 4447 4448 static int amdgpu_persistent_edc_harvesting(struct amdgpu_device *adev, 4449 struct ras_common_if *ras_block) 4450 { 4451 struct ras_query_if info = { 4452 .head = *ras_block, 4453 }; 4454 4455 if (!amdgpu_persistent_edc_harvesting_supported(adev)) 4456 return 0; 4457 4458 if (amdgpu_ras_query_error_status(adev, &info) != 0) 4459 drm_warn(adev_to_drm(adev), "RAS init query failure"); 4460 4461 if (amdgpu_ras_reset_error_status(adev, ras_block->block) != 0) 4462 drm_warn(adev_to_drm(adev), "RAS init harvest reset failure"); 4463 4464 return 0; 4465 } 4466 4467 bool amdgpu_ras_is_poison_mode_supported(struct amdgpu_device *adev) 4468 { 4469 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 4470 4471 if (!con) 4472 return false; 4473 4474 return con->poison_supported; 4475 } 4476 4477 /* helper function to handle common stuff in ip late init phase */ 4478 int amdgpu_ras_block_late_init(struct amdgpu_device *adev, 4479 struct ras_common_if *ras_block) 4480 { 4481 struct amdgpu_ras_block_object *ras_obj = NULL; 4482 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 4483 struct ras_query_if *query_info; 4484 unsigned long ue_count, ce_count; 4485 int r; 4486 4487 /* disable RAS feature per IP block if it is not supported */ 4488 if (!amdgpu_ras_is_supported(adev, ras_block->block)) { 
4489 amdgpu_ras_feature_enable_on_boot(adev, ras_block, 0); 4490 return 0; 4491 } 4492 4493 r = amdgpu_ras_feature_enable_on_boot(adev, ras_block, 1); 4494 if (r) { 4495 if (adev->in_suspend || amdgpu_reset_in_recovery(adev)) { 4496 /* in resume phase, if fail to enable ras, 4497 * clean up all ras fs nodes, and disable ras */ 4498 goto cleanup; 4499 } else 4500 return r; 4501 } 4502 4503 /* check for errors on warm reset edc persisant supported ASIC */ 4504 amdgpu_persistent_edc_harvesting(adev, ras_block); 4505 4506 /* in resume phase, no need to create ras fs node */ 4507 if (adev->in_suspend || amdgpu_reset_in_recovery(adev)) 4508 return 0; 4509 4510 ras_obj = container_of(ras_block, struct amdgpu_ras_block_object, ras_comm); 4511 if (ras_obj->ras_cb || (ras_obj->hw_ops && 4512 (ras_obj->hw_ops->query_poison_status || 4513 ras_obj->hw_ops->handle_poison_consumption))) { 4514 r = amdgpu_ras_interrupt_add_handler(adev, ras_block); 4515 if (r) 4516 goto cleanup; 4517 } 4518 4519 if (ras_obj->hw_ops && 4520 (ras_obj->hw_ops->query_ras_error_count || 4521 ras_obj->hw_ops->query_ras_error_status)) { 4522 r = amdgpu_ras_sysfs_create(adev, ras_block); 4523 if (r) 4524 goto interrupt; 4525 4526 /* Those are the cached values at init. 
4527 */ 4528 query_info = kzalloc_obj(*query_info); 4529 if (!query_info) 4530 return -ENOMEM; 4531 memcpy(&query_info->head, ras_block, sizeof(struct ras_common_if)); 4532 4533 if (amdgpu_ras_query_error_count(adev, &ce_count, &ue_count, query_info) == 0) { 4534 atomic_set(&con->ras_ce_count, ce_count); 4535 atomic_set(&con->ras_ue_count, ue_count); 4536 } 4537 4538 kfree(query_info); 4539 } 4540 4541 return 0; 4542 4543 interrupt: 4544 if (ras_obj->ras_cb) 4545 amdgpu_ras_interrupt_remove_handler(adev, ras_block); 4546 cleanup: 4547 amdgpu_ras_feature_enable(adev, ras_block, 0); 4548 return r; 4549 } 4550 4551 static int amdgpu_ras_block_late_init_default(struct amdgpu_device *adev, 4552 struct ras_common_if *ras_block) 4553 { 4554 return amdgpu_ras_block_late_init(adev, ras_block); 4555 } 4556 4557 /* helper function to remove ras fs node and interrupt handler */ 4558 void amdgpu_ras_block_late_fini(struct amdgpu_device *adev, 4559 struct ras_common_if *ras_block) 4560 { 4561 struct amdgpu_ras_block_object *ras_obj; 4562 if (!ras_block) 4563 return; 4564 4565 amdgpu_ras_sysfs_remove(adev, ras_block); 4566 4567 ras_obj = container_of(ras_block, struct amdgpu_ras_block_object, ras_comm); 4568 if (ras_obj->ras_cb) 4569 amdgpu_ras_interrupt_remove_handler(adev, ras_block); 4570 } 4571 4572 static void amdgpu_ras_block_late_fini_default(struct amdgpu_device *adev, 4573 struct ras_common_if *ras_block) 4574 { 4575 return amdgpu_ras_block_late_fini(adev, ras_block); 4576 } 4577 4578 /* do some init work after IP late init as dependence. 4579 * and it runs in resume/gpu reset/booting up cases. 
4580 */ 4581 void amdgpu_ras_resume(struct amdgpu_device *adev) 4582 { 4583 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 4584 struct ras_manager *obj, *tmp; 4585 4586 if (!adev->ras_enabled || !con) { 4587 /* clean ras context for VEGA20 Gaming after send ras disable cmd */ 4588 amdgpu_release_ras_context(adev); 4589 4590 return; 4591 } 4592 4593 if (con->flags & AMDGPU_RAS_FLAG_INIT_BY_VBIOS) { 4594 /* Set up all other IPs which are not implemented. There is a 4595 * tricky thing that IP's actual ras error type should be 4596 * MULTI_UNCORRECTABLE, but as driver does not handle it, so 4597 * ERROR_NONE make sense anyway. 4598 */ 4599 amdgpu_ras_enable_all_features(adev, 1); 4600 4601 /* We enable ras on all hw_supported block, but as boot 4602 * parameter might disable some of them and one or more IP has 4603 * not implemented yet. So we disable them on behalf. 4604 */ 4605 list_for_each_entry_safe(obj, tmp, &con->head, node) { 4606 if (!amdgpu_ras_is_supported(adev, obj->head.block)) { 4607 amdgpu_ras_feature_enable(adev, &obj->head, 0); 4608 /* there should be no any reference. */ 4609 WARN_ON(alive_obj(obj)); 4610 } 4611 } 4612 } 4613 } 4614 4615 void amdgpu_ras_suspend(struct amdgpu_device *adev) 4616 { 4617 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 4618 4619 if (!adev->ras_enabled || !con) 4620 return; 4621 4622 amdgpu_ras_disable_all_features(adev, 0); 4623 /* Make sure all ras objects are disabled. 
*/ 4624 if (AMDGPU_RAS_GET_FEATURES(con->features)) 4625 amdgpu_ras_disable_all_features(adev, 1); 4626 } 4627 4628 int amdgpu_ras_late_init(struct amdgpu_device *adev) 4629 { 4630 struct amdgpu_ras_block_list *node, *tmp; 4631 struct amdgpu_ras_block_object *obj; 4632 int r; 4633 4634 amdgpu_ras_event_mgr_init(adev); 4635 4636 if (amdgpu_ras_aca_is_supported(adev)) { 4637 if (amdgpu_reset_in_recovery(adev)) { 4638 if (amdgpu_aca_is_enabled(adev)) 4639 r = amdgpu_aca_reset(adev); 4640 else 4641 r = amdgpu_mca_reset(adev); 4642 if (r) 4643 return r; 4644 } 4645 4646 if (!amdgpu_sriov_vf(adev)) { 4647 if (amdgpu_aca_is_enabled(adev)) 4648 amdgpu_ras_set_aca_debug_mode(adev, false); 4649 else 4650 amdgpu_ras_set_mca_debug_mode(adev, false); 4651 } 4652 } 4653 4654 /* Guest side doesn't need init ras feature */ 4655 if (amdgpu_sriov_vf(adev) && !amdgpu_sriov_ras_telemetry_en(adev)) 4656 return 0; 4657 4658 list_for_each_entry_safe(node, tmp, &adev->ras_list, node) { 4659 obj = node->ras_obj; 4660 if (!obj) { 4661 dev_warn(adev->dev, "Warning: abnormal ras list node.\n"); 4662 continue; 4663 } 4664 4665 if (!amdgpu_ras_is_supported(adev, obj->ras_comm.block)) 4666 continue; 4667 4668 if (obj->ras_late_init) { 4669 r = obj->ras_late_init(adev, &obj->ras_comm); 4670 if (r) { 4671 dev_err(adev->dev, "%s failed to execute ras_late_init! 
ret:%d\n", 4672 obj->ras_comm.name, r); 4673 return r; 4674 } 4675 } else 4676 amdgpu_ras_block_late_init_default(adev, &obj->ras_comm); 4677 } 4678 4679 amdgpu_ras_check_bad_page_status(adev); 4680 4681 return 0; 4682 } 4683 4684 /* do some fini work before IP fini as dependence */ 4685 int amdgpu_ras_pre_fini(struct amdgpu_device *adev) 4686 { 4687 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 4688 4689 if (!adev->ras_enabled || !con) 4690 return 0; 4691 4692 4693 /* Need disable ras on all IPs here before ip [hw/sw]fini */ 4694 if (AMDGPU_RAS_GET_FEATURES(con->features)) 4695 amdgpu_ras_disable_all_features(adev, 0); 4696 amdgpu_ras_recovery_fini(adev); 4697 return 0; 4698 } 4699 4700 int amdgpu_ras_fini(struct amdgpu_device *adev) 4701 { 4702 struct amdgpu_ras_block_list *ras_node, *tmp; 4703 struct amdgpu_ras_block_object *obj = NULL; 4704 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 4705 4706 if (!adev->ras_enabled || !con) 4707 return 0; 4708 4709 amdgpu_ras_critical_region_fini(adev); 4710 mutex_destroy(&con->critical_region_lock); 4711 4712 list_for_each_entry_safe(ras_node, tmp, &adev->ras_list, node) { 4713 if (ras_node->ras_obj) { 4714 obj = ras_node->ras_obj; 4715 if (amdgpu_ras_is_supported(adev, obj->ras_comm.block) && 4716 obj->ras_fini) 4717 obj->ras_fini(adev, &obj->ras_comm); 4718 else 4719 amdgpu_ras_block_late_fini_default(adev, &obj->ras_comm); 4720 } 4721 4722 /* Clear ras blocks from ras_list and free ras block list node */ 4723 list_del(&ras_node->node); 4724 kfree(ras_node); 4725 } 4726 4727 amdgpu_ras_fs_fini(adev); 4728 amdgpu_ras_interrupt_remove_all(adev); 4729 4730 if (amdgpu_ras_aca_is_supported(adev)) { 4731 if (amdgpu_aca_is_enabled(adev)) 4732 amdgpu_aca_fini(adev); 4733 else 4734 amdgpu_mca_fini(adev); 4735 } 4736 4737 WARN(AMDGPU_RAS_GET_FEATURES(con->features), "Feature mask is not cleared"); 4738 4739 if (AMDGPU_RAS_GET_FEATURES(con->features)) 4740 amdgpu_ras_disable_all_features(adev, 0); 4741 4742 
cancel_delayed_work_sync(&con->ras_counte_delay_work); 4743 4744 amdgpu_ras_set_context(adev, NULL); 4745 kfree(con); 4746 4747 return 0; 4748 } 4749 4750 bool amdgpu_ras_get_fed_status(struct amdgpu_device *adev) 4751 { 4752 struct amdgpu_ras *ras; 4753 4754 ras = amdgpu_ras_get_context(adev); 4755 if (!ras) 4756 return false; 4757 4758 return test_bit(AMDGPU_RAS_BLOCK__LAST, &ras->ras_err_state); 4759 } 4760 4761 void amdgpu_ras_set_fed(struct amdgpu_device *adev, bool status) 4762 { 4763 struct amdgpu_ras *ras; 4764 4765 ras = amdgpu_ras_get_context(adev); 4766 if (ras) { 4767 if (status) 4768 set_bit(AMDGPU_RAS_BLOCK__LAST, &ras->ras_err_state); 4769 else 4770 clear_bit(AMDGPU_RAS_BLOCK__LAST, &ras->ras_err_state); 4771 } 4772 } 4773 4774 void amdgpu_ras_clear_err_state(struct amdgpu_device *adev) 4775 { 4776 struct amdgpu_ras *ras; 4777 4778 ras = amdgpu_ras_get_context(adev); 4779 if (ras) { 4780 ras->ras_err_state = 0; 4781 ras->gpu_reset_flags = 0; 4782 } 4783 } 4784 4785 void amdgpu_ras_set_err_poison(struct amdgpu_device *adev, 4786 enum amdgpu_ras_block block) 4787 { 4788 struct amdgpu_ras *ras; 4789 4790 ras = amdgpu_ras_get_context(adev); 4791 if (ras) 4792 set_bit(block, &ras->ras_err_state); 4793 } 4794 4795 bool amdgpu_ras_is_err_state(struct amdgpu_device *adev, int block) 4796 { 4797 struct amdgpu_ras *ras; 4798 4799 ras = amdgpu_ras_get_context(adev); 4800 if (ras) { 4801 if (block == AMDGPU_RAS_BLOCK__ANY) 4802 return (ras->ras_err_state != 0); 4803 else 4804 return test_bit(block, &ras->ras_err_state) || 4805 test_bit(AMDGPU_RAS_BLOCK__LAST, 4806 &ras->ras_err_state); 4807 } 4808 4809 return false; 4810 } 4811 4812 static struct ras_event_manager *__get_ras_event_mgr(struct amdgpu_device *adev) 4813 { 4814 struct amdgpu_ras *ras; 4815 4816 ras = amdgpu_ras_get_context(adev); 4817 if (!ras) 4818 return NULL; 4819 4820 return ras->event_mgr; 4821 } 4822 4823 int amdgpu_ras_mark_ras_event_caller(struct amdgpu_device *adev, enum ras_event_type 
type, 4824 const void *caller) 4825 { 4826 struct ras_event_manager *event_mgr; 4827 struct ras_event_state *event_state; 4828 int ret = 0; 4829 4830 if (amdgpu_uniras_enabled(adev)) 4831 return 0; 4832 4833 if (type >= RAS_EVENT_TYPE_COUNT) { 4834 ret = -EINVAL; 4835 goto out; 4836 } 4837 4838 event_mgr = __get_ras_event_mgr(adev); 4839 if (!event_mgr) { 4840 ret = -EINVAL; 4841 goto out; 4842 } 4843 4844 event_state = &event_mgr->event_state[type]; 4845 event_state->last_seqno = atomic64_inc_return(&event_mgr->seqno); 4846 atomic64_inc(&event_state->count); 4847 4848 out: 4849 if (ret && caller) 4850 dev_warn(adev->dev, "failed mark ras event (%d) in %ps, ret:%d\n", 4851 (int)type, caller, ret); 4852 4853 return ret; 4854 } 4855 4856 u64 amdgpu_ras_acquire_event_id(struct amdgpu_device *adev, enum ras_event_type type) 4857 { 4858 struct ras_event_manager *event_mgr; 4859 u64 id; 4860 4861 if (type >= RAS_EVENT_TYPE_COUNT) 4862 return RAS_EVENT_INVALID_ID; 4863 4864 switch (type) { 4865 case RAS_EVENT_TYPE_FATAL: 4866 case RAS_EVENT_TYPE_POISON_CREATION: 4867 case RAS_EVENT_TYPE_POISON_CONSUMPTION: 4868 event_mgr = __get_ras_event_mgr(adev); 4869 if (!event_mgr) 4870 return RAS_EVENT_INVALID_ID; 4871 4872 id = event_mgr->event_state[type].last_seqno; 4873 break; 4874 case RAS_EVENT_TYPE_INVALID: 4875 default: 4876 id = RAS_EVENT_INVALID_ID; 4877 break; 4878 } 4879 4880 return id; 4881 } 4882 4883 int amdgpu_ras_global_ras_isr(struct amdgpu_device *adev) 4884 { 4885 if (atomic_cmpxchg(&amdgpu_ras_in_intr, 0, 1) == 0) { 4886 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 4887 enum ras_event_type type = RAS_EVENT_TYPE_FATAL; 4888 u64 event_id = RAS_EVENT_INVALID_ID; 4889 4890 if (amdgpu_uniras_enabled(adev)) 4891 return 0; 4892 4893 if (!amdgpu_ras_mark_ras_event(adev, type)) 4894 event_id = amdgpu_ras_acquire_event_id(adev, type); 4895 4896 RAS_EVENT_LOG(adev, event_id, "uncorrectable hardware error" 4897 "(ERREVENT_ATHUB_INTERRUPT) detected!\n"); 4898 4899 
amdgpu_ras_set_fed(adev, true); 4900 ras->gpu_reset_flags |= AMDGPU_RAS_GPU_RESET_MODE1_RESET; 4901 amdgpu_ras_reset_gpu(adev); 4902 } 4903 4904 return -EBUSY; 4905 } 4906 4907 bool amdgpu_ras_need_emergency_restart(struct amdgpu_device *adev) 4908 { 4909 if (adev->asic_type == CHIP_VEGA20 && 4910 adev->pm.fw_version <= 0x283400) { 4911 return !(amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) && 4912 amdgpu_ras_intr_triggered(); 4913 } 4914 4915 return false; 4916 } 4917 4918 void amdgpu_release_ras_context(struct amdgpu_device *adev) 4919 { 4920 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 4921 4922 if (!con) 4923 return; 4924 4925 if (!adev->ras_enabled && con->features & BIT(AMDGPU_RAS_BLOCK__GFX)) { 4926 con->features &= ~BIT(AMDGPU_RAS_BLOCK__GFX); 4927 amdgpu_ras_set_context(adev, NULL); 4928 kfree(con); 4929 } 4930 } 4931 4932 #ifdef CONFIG_X86_MCE_AMD 4933 static struct amdgpu_device *find_adev(uint32_t node_id) 4934 { 4935 int i; 4936 struct amdgpu_device *adev = NULL; 4937 4938 for (i = 0; i < mce_adev_list.num_gpu; i++) { 4939 adev = mce_adev_list.devs[i]; 4940 4941 if (adev && adev->gmc.xgmi.connected_to_cpu && 4942 adev->gmc.xgmi.physical_node_id == node_id) 4943 break; 4944 adev = NULL; 4945 } 4946 4947 return adev; 4948 } 4949 4950 #define GET_MCA_IPID_GPUID(m) (((m) >> 44) & 0xF) 4951 #define GET_UMC_INST(m) (((m) >> 21) & 0x7) 4952 #define GET_CHAN_INDEX(m) ((((m) >> 12) & 0x3) | (((m) >> 18) & 0x4)) 4953 #define GPU_ID_OFFSET 8 4954 4955 static int amdgpu_bad_page_notifier(struct notifier_block *nb, 4956 unsigned long val, void *data) 4957 { 4958 struct mce *m = (struct mce *)data; 4959 struct amdgpu_device *adev = NULL; 4960 uint32_t gpu_id = 0; 4961 uint32_t umc_inst = 0, ch_inst = 0; 4962 4963 /* 4964 * If the error was generated in UMC_V2, which belongs to GPU UMCs, 4965 * and error occurred in DramECC (Extended error code = 0) then only 4966 * process the error, else bail out. 
4967 */ 4968 if (!m || !((smca_get_bank_type(m->extcpu, m->bank) == SMCA_UMC_V2) && 4969 (XEC(m->status, 0x3f) == 0x0))) 4970 return NOTIFY_DONE; 4971 4972 /* 4973 * If it is correctable error, return. 4974 */ 4975 if (mce_is_correctable(m)) 4976 return NOTIFY_OK; 4977 4978 /* 4979 * GPU Id is offset by GPU_ID_OFFSET in MCA_IPID_UMC register. 4980 */ 4981 gpu_id = GET_MCA_IPID_GPUID(m->ipid) - GPU_ID_OFFSET; 4982 4983 adev = find_adev(gpu_id); 4984 if (!adev) { 4985 DRM_WARN("%s: Unable to find adev for gpu_id: %d\n", __func__, 4986 gpu_id); 4987 return NOTIFY_DONE; 4988 } 4989 4990 /* 4991 * If it is uncorrectable error, then find out UMC instance and 4992 * channel index. 4993 */ 4994 umc_inst = GET_UMC_INST(m->ipid); 4995 ch_inst = GET_CHAN_INDEX(m->ipid); 4996 4997 dev_info(adev->dev, "Uncorrectable error detected in UMC inst: %d, chan_idx: %d", 4998 umc_inst, ch_inst); 4999 5000 if (!amdgpu_umc_page_retirement_mca(adev, m->addr, ch_inst, umc_inst)) 5001 return NOTIFY_OK; 5002 else 5003 return NOTIFY_DONE; 5004 } 5005 5006 static struct notifier_block amdgpu_bad_page_nb = { 5007 .notifier_call = amdgpu_bad_page_notifier, 5008 .priority = MCE_PRIO_UC, 5009 }; 5010 5011 static void amdgpu_register_bad_pages_mca_notifier(struct amdgpu_device *adev) 5012 { 5013 /* 5014 * Add the adev to the mce_adev_list. 5015 * During mode2 reset, amdgpu device is temporarily 5016 * removed from the mgpu_info list which can cause 5017 * page retirement to fail. 5018 * Use this list instead of mgpu_info to find the amdgpu 5019 * device on which the UMC error was reported. 5020 */ 5021 mce_adev_list.devs[mce_adev_list.num_gpu++] = adev; 5022 5023 /* 5024 * Register the x86 notifier only once 5025 * with MCE subsystem. 
5026 */ 5027 if (notifier_registered == false) { 5028 mce_register_decode_chain(&amdgpu_bad_page_nb); 5029 notifier_registered = true; 5030 } 5031 } 5032 static void amdgpu_unregister_bad_pages_mca_notifier(struct amdgpu_device *adev) 5033 { 5034 int i, j; 5035 5036 if (!notifier_registered && !mce_adev_list.num_gpu) 5037 return; 5038 for (i = 0, j = 0; i < mce_adev_list.num_gpu; i++) { 5039 if (mce_adev_list.devs[i] == adev) 5040 mce_adev_list.devs[i] = NULL; 5041 if (!mce_adev_list.devs[i]) 5042 ++j; 5043 } 5044 5045 if (j == mce_adev_list.num_gpu) { 5046 mce_adev_list.num_gpu = 0; 5047 /* Unregister x86 notifier with MCE subsystem. */ 5048 if (notifier_registered) { 5049 mce_unregister_decode_chain(&amdgpu_bad_page_nb); 5050 notifier_registered = false; 5051 } 5052 } 5053 } 5054 #endif 5055 5056 struct amdgpu_ras *amdgpu_ras_get_context(struct amdgpu_device *adev) 5057 { 5058 if (!adev) 5059 return NULL; 5060 5061 return adev->psp.ras_context.ras; 5062 } 5063 5064 int amdgpu_ras_set_context(struct amdgpu_device *adev, struct amdgpu_ras *ras_con) 5065 { 5066 if (!adev) 5067 return -EINVAL; 5068 5069 adev->psp.ras_context.ras = ras_con; 5070 return 0; 5071 } 5072 5073 /* check if ras is supported on block, say, sdma, gfx */ 5074 int amdgpu_ras_is_supported(struct amdgpu_device *adev, 5075 unsigned int block) 5076 { 5077 int ret = 0; 5078 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 5079 5080 if (block >= AMDGPU_RAS_BLOCK_COUNT) 5081 return 0; 5082 5083 ret = ras && (adev->ras_enabled & (1 << block)); 5084 5085 /* For the special asic with mem ecc enabled but sram ecc 5086 * not enabled, even if the ras block is not supported on 5087 * .ras_enabled, if the asic supports poison mode and the 5088 * ras block has ras configuration, it can be considered 5089 * that the ras block supports ras function. 
5090 */ 5091 if (!ret && 5092 (block == AMDGPU_RAS_BLOCK__GFX || 5093 block == AMDGPU_RAS_BLOCK__SDMA || 5094 block == AMDGPU_RAS_BLOCK__VCN || 5095 block == AMDGPU_RAS_BLOCK__JPEG) && 5096 (amdgpu_ras_mask & (1 << block)) && 5097 amdgpu_ras_is_poison_mode_supported(adev) && 5098 amdgpu_ras_get_ras_block(adev, block, 0)) 5099 ret = 1; 5100 5101 return ret; 5102 } 5103 5104 int amdgpu_ras_reset_gpu(struct amdgpu_device *adev) 5105 { 5106 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 5107 5108 /* mode1 is the only selection for RMA status */ 5109 if (amdgpu_ras_is_rma(adev)) { 5110 ras->gpu_reset_flags = 0; 5111 ras->gpu_reset_flags |= AMDGPU_RAS_GPU_RESET_MODE1_RESET; 5112 } 5113 5114 if (atomic_cmpxchg(&ras->in_recovery, 0, 1) == 0) { 5115 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 5116 int hive_ras_recovery = 0; 5117 5118 if (hive) { 5119 hive_ras_recovery = atomic_read(&hive->ras_recovery); 5120 amdgpu_put_xgmi_hive(hive); 5121 } 5122 /* In the case of multiple GPUs, after a GPU has started 5123 * resetting all GPUs on hive, other GPUs do not need to 5124 * trigger GPU reset again. 
5125 */ 5126 if (!hive_ras_recovery) 5127 amdgpu_reset_domain_schedule(ras->adev->reset_domain, &ras->recovery_work); 5128 else 5129 atomic_set(&ras->in_recovery, 0); 5130 } else { 5131 flush_work(&ras->recovery_work); 5132 amdgpu_reset_domain_schedule(ras->adev->reset_domain, &ras->recovery_work); 5133 } 5134 5135 return 0; 5136 } 5137 5138 int amdgpu_ras_set_mca_debug_mode(struct amdgpu_device *adev, bool enable) 5139 { 5140 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 5141 int ret = 0; 5142 5143 if (con) { 5144 ret = amdgpu_mca_smu_set_debug_mode(adev, enable); 5145 if (!ret) 5146 con->is_aca_debug_mode = enable; 5147 } 5148 5149 return ret; 5150 } 5151 5152 int amdgpu_ras_set_aca_debug_mode(struct amdgpu_device *adev, bool enable) 5153 { 5154 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 5155 int ret = 0; 5156 5157 if (con) { 5158 if (amdgpu_aca_is_enabled(adev)) 5159 ret = amdgpu_aca_smu_set_debug_mode(adev, enable); 5160 else 5161 ret = amdgpu_mca_smu_set_debug_mode(adev, enable); 5162 if (!ret) 5163 con->is_aca_debug_mode = enable; 5164 } 5165 5166 return ret; 5167 } 5168 5169 bool amdgpu_ras_get_aca_debug_mode(struct amdgpu_device *adev) 5170 { 5171 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 5172 const struct aca_smu_funcs *smu_funcs = adev->aca.smu_funcs; 5173 const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs; 5174 5175 if (!con) 5176 return false; 5177 5178 if ((amdgpu_aca_is_enabled(adev) && smu_funcs && smu_funcs->set_debug_mode) || 5179 (!amdgpu_aca_is_enabled(adev) && mca_funcs && mca_funcs->mca_set_debug_mode)) 5180 return con->is_aca_debug_mode; 5181 else 5182 return true; 5183 } 5184 5185 bool amdgpu_ras_get_error_query_mode(struct amdgpu_device *adev, 5186 unsigned int *error_query_mode) 5187 { 5188 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 5189 const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs; 5190 const struct aca_smu_funcs *smu_funcs = adev->aca.smu_funcs; 5191 5192 
if (!con) { 5193 *error_query_mode = AMDGPU_RAS_INVALID_ERROR_QUERY; 5194 return false; 5195 } 5196 5197 if (amdgpu_sriov_vf(adev)) { 5198 *error_query_mode = AMDGPU_RAS_VIRT_ERROR_COUNT_QUERY; 5199 } else if ((smu_funcs && smu_funcs->set_debug_mode) || (mca_funcs && mca_funcs->mca_set_debug_mode)) { 5200 *error_query_mode = 5201 (con->is_aca_debug_mode) ? AMDGPU_RAS_DIRECT_ERROR_QUERY : AMDGPU_RAS_FIRMWARE_ERROR_QUERY; 5202 } else { 5203 *error_query_mode = AMDGPU_RAS_DIRECT_ERROR_QUERY; 5204 } 5205 5206 return true; 5207 } 5208 5209 /* Register each ip ras block into amdgpu ras */ 5210 int amdgpu_ras_register_ras_block(struct amdgpu_device *adev, 5211 struct amdgpu_ras_block_object *ras_block_obj) 5212 { 5213 struct amdgpu_ras_block_list *ras_node; 5214 if (!adev || !ras_block_obj) 5215 return -EINVAL; 5216 5217 ras_node = kzalloc_obj(*ras_node); 5218 if (!ras_node) 5219 return -ENOMEM; 5220 5221 INIT_LIST_HEAD(&ras_node->node); 5222 ras_node->ras_obj = ras_block_obj; 5223 list_add_tail(&ras_node->node, &adev->ras_list); 5224 5225 return 0; 5226 } 5227 5228 void amdgpu_ras_get_error_type_name(uint32_t err_type, char *err_type_name) 5229 { 5230 if (!err_type_name) 5231 return; 5232 5233 switch (err_type) { 5234 case AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE: 5235 sprintf(err_type_name, "correctable"); 5236 break; 5237 case AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE: 5238 sprintf(err_type_name, "uncorrectable"); 5239 break; 5240 default: 5241 sprintf(err_type_name, "unknown"); 5242 break; 5243 } 5244 } 5245 5246 bool amdgpu_ras_inst_get_memory_id_field(struct amdgpu_device *adev, 5247 const struct amdgpu_ras_err_status_reg_entry *reg_entry, 5248 uint32_t instance, 5249 uint32_t *memory_id) 5250 { 5251 uint32_t err_status_lo_data, err_status_lo_offset; 5252 5253 if (!reg_entry) 5254 return false; 5255 5256 err_status_lo_offset = 5257 AMDGPU_RAS_REG_ENTRY_OFFSET(reg_entry->hwip, instance, 5258 reg_entry->seg_lo, reg_entry->reg_lo); 5259 err_status_lo_data = 
RREG32(err_status_lo_offset); 5260 5261 if ((reg_entry->flags & AMDGPU_RAS_ERR_STATUS_VALID) && 5262 !REG_GET_FIELD(err_status_lo_data, ERR_STATUS_LO, ERR_STATUS_VALID_FLAG)) 5263 return false; 5264 5265 *memory_id = REG_GET_FIELD(err_status_lo_data, ERR_STATUS_LO, MEMORY_ID); 5266 5267 return true; 5268 } 5269 5270 bool amdgpu_ras_inst_get_err_cnt_field(struct amdgpu_device *adev, 5271 const struct amdgpu_ras_err_status_reg_entry *reg_entry, 5272 uint32_t instance, 5273 unsigned long *err_cnt) 5274 { 5275 uint32_t err_status_hi_data, err_status_hi_offset; 5276 5277 if (!reg_entry) 5278 return false; 5279 5280 err_status_hi_offset = 5281 AMDGPU_RAS_REG_ENTRY_OFFSET(reg_entry->hwip, instance, 5282 reg_entry->seg_hi, reg_entry->reg_hi); 5283 err_status_hi_data = RREG32(err_status_hi_offset); 5284 5285 if ((reg_entry->flags & AMDGPU_RAS_ERR_INFO_VALID) && 5286 !REG_GET_FIELD(err_status_hi_data, ERR_STATUS_HI, ERR_INFO_VALID_FLAG)) 5287 /* keep the check here in case we need to refer to the result later */ 5288 dev_dbg(adev->dev, "Invalid err_info field\n"); 5289 5290 /* read err count */ 5291 *err_cnt = REG_GET_FIELD(err_status_hi_data, ERR_STATUS, ERR_CNT); 5292 5293 return true; 5294 } 5295 5296 void amdgpu_ras_inst_query_ras_error_count(struct amdgpu_device *adev, 5297 const struct amdgpu_ras_err_status_reg_entry *reg_list, 5298 uint32_t reg_list_size, 5299 const struct amdgpu_ras_memory_id_entry *mem_list, 5300 uint32_t mem_list_size, 5301 uint32_t instance, 5302 uint32_t err_type, 5303 unsigned long *err_count) 5304 { 5305 uint32_t memory_id; 5306 unsigned long err_cnt; 5307 char err_type_name[16]; 5308 uint32_t i, j; 5309 5310 for (i = 0; i < reg_list_size; i++) { 5311 /* query memory_id from err_status_lo */ 5312 if (!amdgpu_ras_inst_get_memory_id_field(adev, ®_list[i], 5313 instance, &memory_id)) 5314 continue; 5315 5316 /* query err_cnt from err_status_hi */ 5317 if (!amdgpu_ras_inst_get_err_cnt_field(adev, ®_list[i], 5318 instance, &err_cnt) || 5319 
!err_cnt) 5320 continue; 5321 5322 *err_count += err_cnt; 5323 5324 /* log the errors */ 5325 amdgpu_ras_get_error_type_name(err_type, err_type_name); 5326 if (!mem_list) { 5327 /* memory_list is not supported */ 5328 dev_info(adev->dev, 5329 "%ld %s hardware errors detected in %s, instance: %d, memory_id: %d\n", 5330 err_cnt, err_type_name, 5331 reg_list[i].block_name, 5332 instance, memory_id); 5333 } else { 5334 for (j = 0; j < mem_list_size; j++) { 5335 if (memory_id == mem_list[j].memory_id) { 5336 dev_info(adev->dev, 5337 "%ld %s hardware errors detected in %s, instance: %d, memory block: %s\n", 5338 err_cnt, err_type_name, 5339 reg_list[i].block_name, 5340 instance, mem_list[j].name); 5341 break; 5342 } 5343 } 5344 } 5345 } 5346 } 5347 5348 void amdgpu_ras_inst_reset_ras_error_count(struct amdgpu_device *adev, 5349 const struct amdgpu_ras_err_status_reg_entry *reg_list, 5350 uint32_t reg_list_size, 5351 uint32_t instance) 5352 { 5353 uint32_t err_status_lo_offset, err_status_hi_offset; 5354 uint32_t i; 5355 5356 for (i = 0; i < reg_list_size; i++) { 5357 err_status_lo_offset = 5358 AMDGPU_RAS_REG_ENTRY_OFFSET(reg_list[i].hwip, instance, 5359 reg_list[i].seg_lo, reg_list[i].reg_lo); 5360 err_status_hi_offset = 5361 AMDGPU_RAS_REG_ENTRY_OFFSET(reg_list[i].hwip, instance, 5362 reg_list[i].seg_hi, reg_list[i].reg_hi); 5363 WREG32(err_status_lo_offset, 0); 5364 WREG32(err_status_hi_offset, 0); 5365 } 5366 } 5367 5368 int amdgpu_ras_error_data_init(struct ras_err_data *err_data) 5369 { 5370 memset(err_data, 0, sizeof(*err_data)); 5371 5372 INIT_LIST_HEAD(&err_data->err_node_list); 5373 5374 return 0; 5375 } 5376 5377 static void amdgpu_ras_error_node_release(struct ras_err_node *err_node) 5378 { 5379 if (!err_node) 5380 return; 5381 5382 list_del(&err_node->node); 5383 kvfree(err_node); 5384 } 5385 5386 void amdgpu_ras_error_data_fini(struct ras_err_data *err_data) 5387 { 5388 struct ras_err_node *err_node, *tmp; 5389 5390 list_for_each_entry_safe(err_node, tmp, 
&err_data->err_node_list, node) 5391 amdgpu_ras_error_node_release(err_node); 5392 } 5393 5394 static struct ras_err_node *amdgpu_ras_error_find_node_by_id(struct ras_err_data *err_data, 5395 struct amdgpu_smuio_mcm_config_info *mcm_info) 5396 { 5397 struct ras_err_node *err_node; 5398 struct amdgpu_smuio_mcm_config_info *ref_id; 5399 5400 if (!err_data || !mcm_info) 5401 return NULL; 5402 5403 for_each_ras_error(err_node, err_data) { 5404 ref_id = &err_node->err_info.mcm_info; 5405 5406 if (mcm_info->socket_id == ref_id->socket_id && 5407 mcm_info->die_id == ref_id->die_id) 5408 return err_node; 5409 } 5410 5411 return NULL; 5412 } 5413 5414 static struct ras_err_node *amdgpu_ras_error_node_new(void) 5415 { 5416 struct ras_err_node *err_node; 5417 5418 err_node = kvzalloc_obj(*err_node); 5419 if (!err_node) 5420 return NULL; 5421 5422 INIT_LIST_HEAD(&err_node->node); 5423 5424 return err_node; 5425 } 5426 5427 static int ras_err_info_cmp(void *priv, const struct list_head *a, const struct list_head *b) 5428 { 5429 struct ras_err_node *nodea = container_of(a, struct ras_err_node, node); 5430 struct ras_err_node *nodeb = container_of(b, struct ras_err_node, node); 5431 struct amdgpu_smuio_mcm_config_info *infoa = &nodea->err_info.mcm_info; 5432 struct amdgpu_smuio_mcm_config_info *infob = &nodeb->err_info.mcm_info; 5433 5434 if (unlikely(infoa->socket_id != infob->socket_id)) 5435 return infoa->socket_id - infob->socket_id; 5436 else 5437 return infoa->die_id - infob->die_id; 5438 5439 return 0; 5440 } 5441 5442 static struct ras_err_info *amdgpu_ras_error_get_info(struct ras_err_data *err_data, 5443 struct amdgpu_smuio_mcm_config_info *mcm_info) 5444 { 5445 struct ras_err_node *err_node; 5446 5447 err_node = amdgpu_ras_error_find_node_by_id(err_data, mcm_info); 5448 if (err_node) 5449 return &err_node->err_info; 5450 5451 err_node = amdgpu_ras_error_node_new(); 5452 if (!err_node) 5453 return NULL; 5454 5455 memcpy(&err_node->err_info.mcm_info, mcm_info, 
sizeof(*mcm_info)); 5456 5457 err_data->err_list_count++; 5458 list_add_tail(&err_node->node, &err_data->err_node_list); 5459 list_sort(NULL, &err_data->err_node_list, ras_err_info_cmp); 5460 5461 return &err_node->err_info; 5462 } 5463 5464 int amdgpu_ras_error_statistic_ue_count(struct ras_err_data *err_data, 5465 struct amdgpu_smuio_mcm_config_info *mcm_info, 5466 u64 count) 5467 { 5468 struct ras_err_info *err_info; 5469 5470 if (!err_data || !mcm_info) 5471 return -EINVAL; 5472 5473 if (!count) 5474 return 0; 5475 5476 err_info = amdgpu_ras_error_get_info(err_data, mcm_info); 5477 if (!err_info) 5478 return -EINVAL; 5479 5480 err_info->ue_count += count; 5481 err_data->ue_count += count; 5482 5483 return 0; 5484 } 5485 5486 int amdgpu_ras_error_statistic_ce_count(struct ras_err_data *err_data, 5487 struct amdgpu_smuio_mcm_config_info *mcm_info, 5488 u64 count) 5489 { 5490 struct ras_err_info *err_info; 5491 5492 if (!err_data || !mcm_info) 5493 return -EINVAL; 5494 5495 if (!count) 5496 return 0; 5497 5498 err_info = amdgpu_ras_error_get_info(err_data, mcm_info); 5499 if (!err_info) 5500 return -EINVAL; 5501 5502 err_info->ce_count += count; 5503 err_data->ce_count += count; 5504 5505 return 0; 5506 } 5507 5508 int amdgpu_ras_error_statistic_de_count(struct ras_err_data *err_data, 5509 struct amdgpu_smuio_mcm_config_info *mcm_info, 5510 u64 count) 5511 { 5512 struct ras_err_info *err_info; 5513 5514 if (!err_data || !mcm_info) 5515 return -EINVAL; 5516 5517 if (!count) 5518 return 0; 5519 5520 err_info = amdgpu_ras_error_get_info(err_data, mcm_info); 5521 if (!err_info) 5522 return -EINVAL; 5523 5524 err_info->de_count += count; 5525 err_data->de_count += count; 5526 5527 return 0; 5528 } 5529 5530 #define mmMP0_SMN_C2PMSG_92 0x1609C 5531 #define mmMP0_SMN_C2PMSG_126 0x160BE 5532 static void amdgpu_ras_boot_time_error_reporting(struct amdgpu_device *adev, 5533 u32 instance) 5534 { 5535 u32 socket_id, aid_id, hbm_id; 5536 u32 fw_status; 5537 u32 boot_error; 
5538 u64 reg_addr; 5539 5540 /* The pattern for smn addressing in other SOC could be different from 5541 * the one for aqua_vanjaram. We should revisit the code if the pattern 5542 * is changed. In such case, replace the aqua_vanjaram implementation 5543 * with more common helper */ 5544 reg_addr = (mmMP0_SMN_C2PMSG_92 << 2) + 5545 amdgpu_reg_get_smn_base64(adev, MP0_HWIP, instance); 5546 fw_status = amdgpu_device_indirect_rreg_ext(adev, reg_addr); 5547 5548 reg_addr = (mmMP0_SMN_C2PMSG_126 << 2) + 5549 amdgpu_reg_get_smn_base64(adev, MP0_HWIP, instance); 5550 boot_error = amdgpu_device_indirect_rreg_ext(adev, reg_addr); 5551 5552 socket_id = AMDGPU_RAS_GPU_ERR_SOCKET_ID(boot_error); 5553 aid_id = AMDGPU_RAS_GPU_ERR_AID_ID(boot_error); 5554 hbm_id = ((1 == AMDGPU_RAS_GPU_ERR_HBM_ID(boot_error)) ? 0 : 1); 5555 5556 if (AMDGPU_RAS_GPU_ERR_MEM_TRAINING(boot_error)) 5557 dev_info(adev->dev, 5558 "socket: %d, aid: %d, hbm: %d, fw_status: 0x%x, memory training failed\n", 5559 socket_id, aid_id, hbm_id, fw_status); 5560 5561 if (AMDGPU_RAS_GPU_ERR_FW_LOAD(boot_error)) 5562 dev_info(adev->dev, 5563 "socket: %d, aid: %d, fw_status: 0x%x, firmware load failed at boot time\n", 5564 socket_id, aid_id, fw_status); 5565 5566 if (AMDGPU_RAS_GPU_ERR_WAFL_LINK_TRAINING(boot_error)) 5567 dev_info(adev->dev, 5568 "socket: %d, aid: %d, fw_status: 0x%x, wafl link training failed\n", 5569 socket_id, aid_id, fw_status); 5570 5571 if (AMDGPU_RAS_GPU_ERR_XGMI_LINK_TRAINING(boot_error)) 5572 dev_info(adev->dev, 5573 "socket: %d, aid: %d, fw_status: 0x%x, xgmi link training failed\n", 5574 socket_id, aid_id, fw_status); 5575 5576 if (AMDGPU_RAS_GPU_ERR_USR_CP_LINK_TRAINING(boot_error)) 5577 dev_info(adev->dev, 5578 "socket: %d, aid: %d, fw_status: 0x%x, usr cp link training failed\n", 5579 socket_id, aid_id, fw_status); 5580 5581 if (AMDGPU_RAS_GPU_ERR_USR_DP_LINK_TRAINING(boot_error)) 5582 dev_info(adev->dev, 5583 "socket: %d, aid: %d, fw_status: 0x%x, usr dp link training failed\n", 5584 
socket_id, aid_id, fw_status); 5585 5586 if (AMDGPU_RAS_GPU_ERR_HBM_MEM_TEST(boot_error)) 5587 dev_info(adev->dev, 5588 "socket: %d, aid: %d, hbm: %d, fw_status: 0x%x, hbm memory test failed\n", 5589 socket_id, aid_id, hbm_id, fw_status); 5590 5591 if (AMDGPU_RAS_GPU_ERR_HBM_BIST_TEST(boot_error)) 5592 dev_info(adev->dev, 5593 "socket: %d, aid: %d, hbm: %d, fw_status: 0x%x, hbm bist test failed\n", 5594 socket_id, aid_id, hbm_id, fw_status); 5595 5596 if (AMDGPU_RAS_GPU_ERR_DATA_ABORT(boot_error)) 5597 dev_info(adev->dev, 5598 "socket: %d, aid: %d, fw_status: 0x%x, data abort exception\n", 5599 socket_id, aid_id, fw_status); 5600 5601 if (AMDGPU_RAS_GPU_ERR_GENERIC(boot_error)) 5602 dev_info(adev->dev, 5603 "socket: %d, aid: %d, fw_status: 0x%x, Boot Controller Generic Error\n", 5604 socket_id, aid_id, fw_status); 5605 } 5606 5607 static bool amdgpu_ras_boot_error_detected(struct amdgpu_device *adev, 5608 u32 instance) 5609 { 5610 u64 reg_addr; 5611 u32 reg_data; 5612 int retry_loop; 5613 5614 reg_addr = (mmMP0_SMN_C2PMSG_92 << 2) + 5615 amdgpu_reg_get_smn_base64(adev, MP0_HWIP, instance); 5616 5617 for (retry_loop = 0; retry_loop < AMDGPU_RAS_BOOT_STATUS_POLLING_LIMIT; retry_loop++) { 5618 reg_data = amdgpu_device_indirect_rreg_ext(adev, reg_addr); 5619 if ((reg_data & AMDGPU_RAS_BOOT_STATUS_MASK) == AMDGPU_RAS_BOOT_STEADY_STATUS) 5620 return false; 5621 else 5622 msleep(1); 5623 } 5624 5625 return true; 5626 } 5627 5628 void amdgpu_ras_query_boot_status(struct amdgpu_device *adev, u32 num_instances) 5629 { 5630 u32 i; 5631 5632 for (i = 0; i < num_instances; i++) { 5633 if (amdgpu_ras_boot_error_detected(adev, i)) 5634 amdgpu_ras_boot_time_error_reporting(adev, i); 5635 } 5636 } 5637 5638 int amdgpu_ras_reserve_page(struct amdgpu_device *adev, uint64_t pfn) 5639 { 5640 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 5641 struct amdgpu_vram_mgr *mgr = &adev->mman.vram_mgr; 5642 uint64_t start = pfn << AMDGPU_GPU_PAGE_SHIFT; 5643 int ret = 0; 5644 5645 if 
(amdgpu_ras_check_critical_address(adev, start)) 5646 return 0; 5647 5648 mutex_lock(&con->page_rsv_lock); 5649 ret = amdgpu_vram_mgr_query_page_status(mgr, start); 5650 if (ret == -ENOENT) 5651 ret = amdgpu_vram_mgr_reserve_range(mgr, start, AMDGPU_GPU_PAGE_SIZE); 5652 mutex_unlock(&con->page_rsv_lock); 5653 5654 return ret; 5655 } 5656 5657 void amdgpu_ras_event_log_print(struct amdgpu_device *adev, u64 event_id, 5658 const char *fmt, ...) 5659 { 5660 struct va_format vaf; 5661 va_list args; 5662 5663 va_start(args, fmt); 5664 vaf.fmt = fmt; 5665 vaf.va = &args; 5666 5667 if (RAS_EVENT_ID_IS_VALID(event_id)) 5668 dev_printk(KERN_INFO, adev->dev, "{%llu}%pV", event_id, &vaf); 5669 else 5670 dev_printk(KERN_INFO, adev->dev, "%pV", &vaf); 5671 5672 va_end(args); 5673 } 5674 5675 bool amdgpu_ras_is_rma(struct amdgpu_device *adev) 5676 { 5677 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 5678 5679 if (amdgpu_uniras_enabled(adev)) 5680 return amdgpu_ras_mgr_is_rma(adev); 5681 5682 if (!con) 5683 return false; 5684 5685 return con->is_rma; 5686 } 5687 5688 int amdgpu_ras_add_critical_region(struct amdgpu_device *adev, 5689 struct amdgpu_bo *bo) 5690 { 5691 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 5692 struct amdgpu_vram_mgr_resource *vres; 5693 struct ras_critical_region *region; 5694 struct gpu_buddy_block *block; 5695 int ret = 0; 5696 5697 if (!bo || !bo->tbo.resource) 5698 return -EINVAL; 5699 5700 vres = to_amdgpu_vram_mgr_resource(bo->tbo.resource); 5701 5702 mutex_lock(&con->critical_region_lock); 5703 5704 /* Check if the bo had been recorded */ 5705 list_for_each_entry(region, &con->critical_region_head, node) 5706 if (region->bo == bo) 5707 goto out; 5708 5709 /* Record new critical amdgpu bo */ 5710 list_for_each_entry(block, &vres->blocks, link) { 5711 region = kzalloc_obj(*region); 5712 if (!region) { 5713 ret = -ENOMEM; 5714 goto out; 5715 } 5716 region->bo = bo; 5717 region->start = amdgpu_vram_mgr_block_start(block); 5718 
region->size = amdgpu_vram_mgr_block_size(block); 5719 list_add_tail(®ion->node, &con->critical_region_head); 5720 } 5721 5722 out: 5723 mutex_unlock(&con->critical_region_lock); 5724 5725 return ret; 5726 } 5727 5728 static void amdgpu_ras_critical_region_init(struct amdgpu_device *adev) 5729 { 5730 amdgpu_ras_add_critical_region(adev, adev->mman.resv_region[AMDGPU_RESV_FW].bo); 5731 } 5732 5733 static void amdgpu_ras_critical_region_fini(struct amdgpu_device *adev) 5734 { 5735 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 5736 struct ras_critical_region *region, *tmp; 5737 5738 mutex_lock(&con->critical_region_lock); 5739 list_for_each_entry_safe(region, tmp, &con->critical_region_head, node) { 5740 list_del(®ion->node); 5741 kfree(region); 5742 } 5743 mutex_unlock(&con->critical_region_lock); 5744 } 5745 5746 bool amdgpu_ras_check_critical_address(struct amdgpu_device *adev, uint64_t addr) 5747 { 5748 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 5749 struct ras_critical_region *region; 5750 bool ret = false; 5751 5752 mutex_lock(&con->critical_region_lock); 5753 list_for_each_entry(region, &con->critical_region_head, node) { 5754 if ((region->start <= addr) && 5755 (addr < (region->start + region->size))) { 5756 ret = true; 5757 break; 5758 } 5759 } 5760 mutex_unlock(&con->critical_region_lock); 5761 5762 return ret; 5763 } 5764 5765 void amdgpu_ras_pre_reset(struct amdgpu_device *adev, 5766 struct list_head *device_list) 5767 { 5768 struct amdgpu_device *tmp_adev = NULL; 5769 5770 list_for_each_entry(tmp_adev, device_list, reset_list) { 5771 if (amdgpu_uniras_enabled(tmp_adev)) 5772 amdgpu_ras_mgr_pre_reset(tmp_adev); 5773 } 5774 } 5775 5776 void amdgpu_ras_post_reset(struct amdgpu_device *adev, 5777 struct list_head *device_list) 5778 { 5779 struct amdgpu_device *tmp_adev = NULL; 5780 5781 list_for_each_entry(tmp_adev, device_list, reset_list) { 5782 if (amdgpu_uniras_enabled(tmp_adev)) 5783 amdgpu_ras_mgr_post_reset(tmp_adev); 5784 } 5785 } 
5786