/*
 * Copyright 2018 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 *
 */
#include "amdgpu_reg_access.h"
#include <linux/debugfs.h>
#include <linux/list.h>
#include <linux/module.h>
#include <linux/uaccess.h>
#include <linux/reboot.h>
#include <linux/syscalls.h>
#include <linux/pm_runtime.h>
#include <linux/list_sort.h>

#include "amdgpu.h"
#include "amdgpu_ras.h"
#include "amdgpu_atomfirmware.h"
#include "amdgpu_xgmi.h"
#include "ivsrcid/nbio/irqsrcs_nbif_7_4.h"
#include "nbio_v4_3.h"
#include "nbif_v6_3_1.h"
#include "nbio_v7_9.h"
#include "atom.h"
#include "amdgpu_reset.h"
#include "amdgpu_psp.h"
#include "amdgpu_ras_mgr.h"
#include "amdgpu_virt_ras_cmd.h"

#ifdef CONFIG_X86_MCE_AMD
#include <asm/mce.h>

static bool notifier_registered;
#endif
static const char *RAS_FS_NAME = "ras";

const char *ras_error_string[] = {
	"none",
	"parity",
	"single_correctable",
	"multi_uncorrectable",
	"poison",
};

const char *ras_block_string[] = {
	"umc",
	"sdma",
	"gfx",
	"mmhub",
	"athub",
	"pcie_bif",
	"hdp",
	"xgmi_wafl",
	"df",
	"smn",
	"sem",
	"mp0",
	"mp1",
	"fuse",
	"mca",
	"vcn",
	"jpeg",
	"ih",
	"mpio",
	"mmsch",
};

const char *ras_mca_block_string[] = {
	"mca_mp0",
	"mca_mp1",
	"mca_mpio",
	"mca_iohc",
};

struct amdgpu_ras_block_list {
	/* ras block link */
	struct list_head node;

	struct amdgpu_ras_block_object *ras_obj;
};

const char *get_ras_block_str(struct ras_common_if *ras_block)
{
	if (!ras_block)
		return "NULL";

	if (ras_block->block >= AMDGPU_RAS_BLOCK_COUNT ||
	    ras_block->block >= ARRAY_SIZE(ras_block_string))
		return "OUT OF RANGE";

	if (ras_block->block == AMDGPU_RAS_BLOCK__MCA)
		return ras_mca_block_string[ras_block->sub_block_index];

	return ras_block_string[ras_block->block];
}
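
/*
 * For example (illustrative): a ras_common_if with block ==
 * AMDGPU_RAS_BLOCK__GFX resolves to "gfx", while block ==
 * AMDGPU_RAS_BLOCK__MCA with sub_block_index == AMDGPU_RAS_MCA_BLOCK__MP0
 * resolves to "mca_mp0".
 */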

#define ras_block_str(_BLOCK_) \
	(((_BLOCK_) < ARRAY_SIZE(ras_block_string)) ? ras_block_string[_BLOCK_] : "Out Of Range")

#define ras_err_str(i) (ras_error_string[ffs(i)])

#define RAS_DEFAULT_FLAGS (AMDGPU_RAS_FLAG_INIT_BY_VBIOS)

/* inject address is 52 bits */
#define RAS_UMC_INJECT_ADDR_LIMIT	(0x1ULL << 52)

/* typical ECC bad page rate is 1 bad page per 100MB VRAM */
#define RAS_BAD_PAGE_COVER	(100 * 1024 * 1024ULL)

#define MAX_UMC_POISON_POLLING_TIME_ASYNC 10

#define AMDGPU_RAS_RETIRE_PAGE_INTERVAL 100 //ms

#define MAX_FLUSH_RETIRE_DWORK_TIMES 100

#define BYPASS_ALLOCATED_ADDRESS	0x0
#define BYPASS_INITIALIZATION_ADDRESS	0x1

enum amdgpu_ras_retire_page_reservation {
	AMDGPU_RAS_RETIRE_PAGE_RESERVED,
	AMDGPU_RAS_RETIRE_PAGE_PENDING,
	AMDGPU_RAS_RETIRE_PAGE_FAULT,
};

atomic_t amdgpu_ras_in_intr = ATOMIC_INIT(0);

static int amdgpu_ras_check_bad_page_unlock(struct amdgpu_ras *con,
				uint64_t addr);
static int amdgpu_ras_check_bad_page(struct amdgpu_device *adev,
				uint64_t addr);

static void amdgpu_ras_critical_region_init(struct amdgpu_device *adev);
static void amdgpu_ras_critical_region_fini(struct amdgpu_device *adev);

#ifdef CONFIG_X86_MCE_AMD
static void amdgpu_register_bad_pages_mca_notifier(struct amdgpu_device *adev);
static void
amdgpu_unregister_bad_pages_mca_notifier(struct amdgpu_device *adev);
struct mce_notifier_adev_list {
	struct amdgpu_device *devs[MAX_GPU_INSTANCE];
	int num_gpu;
};
static struct mce_notifier_adev_list mce_adev_list;
#endif

void amdgpu_ras_set_error_query_ready(struct amdgpu_device *adev, bool ready)
{
	if (adev && amdgpu_ras_get_context(adev))
		amdgpu_ras_get_context(adev)->error_query_ready = ready;
}

static bool amdgpu_ras_get_error_query_ready(struct amdgpu_device *adev)
{
	if (adev && amdgpu_ras_get_context(adev))
		return amdgpu_ras_get_context(adev)->error_query_ready;

	return false;
}

static int amdgpu_reserve_page_direct(struct amdgpu_device *adev, uint64_t address)
{
	struct ras_err_data err_data;
	struct eeprom_table_record err_rec;
	int ret;

	ret = amdgpu_ras_check_bad_page(adev, address);
	if (ret == -EINVAL) {
		dev_warn(adev->dev,
			 "RAS WARN: input address 0x%llx is invalid.\n",
			 address);
		return -EINVAL;
	} else if (ret == 1) {
		dev_warn(adev->dev,
			 "RAS WARN: 0x%llx has already been marked as bad page!\n",
			 address);
		return 0;
	}

	ret = amdgpu_ras_error_data_init(&err_data);
	if (ret)
		return ret;

	memset(&err_rec, 0x0, sizeof(struct eeprom_table_record));
	err_data.err_addr = &err_rec;
	amdgpu_umc_fill_error_record(&err_data, address, address, 0, 0);

	if (amdgpu_bad_page_threshold != 0) {
		amdgpu_ras_add_bad_pages(adev, err_data.err_addr,
					 err_data.err_addr_cnt, false);
		amdgpu_ras_save_bad_pages(adev, NULL);
	}

	amdgpu_ras_error_data_fini(&err_data);

	dev_warn(adev->dev, "WARNING: THIS IS ONLY FOR TEST PURPOSES AND WILL CORRUPT RAS EEPROM\n");
	dev_warn(adev->dev, "Clear EEPROM:\n");
	dev_warn(adev->dev, "	echo 1 > /sys/kernel/debug/dri/0/ras/ras_eeprom_reset\n");

	return 0;
}
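
/*
 * Usage sketch for the path above (assumes card 0 and debugfs mounted at
 * /sys/kernel/debug): retire a page at a given VRAM offset through the
 * control interface, then reset the EEPROM table after testing.
 *
 *	echo "retire_page 0x1000" > /sys/kernel/debug/dri/0/ras/ras_ctrl
 *	echo 1 > /sys/kernel/debug/dri/0/ras/ras_eeprom_reset
 */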

static int amdgpu_check_address_validity(struct amdgpu_device *adev,
			uint64_t address, uint64_t flags)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct amdgpu_vram_block_info blk_info;
	uint64_t page_pfns[32] = {0};
	int i, ret, count;
	bool hit = false;

	if (amdgpu_ip_version(adev, UMC_HWIP, 0) < IP_VERSION(12, 0, 0))
		return 0;

	if (amdgpu_sriov_vf(adev)) {
		if (amdgpu_uniras_enabled(adev)) {
			if (amdgpu_virt_ras_check_address_validity(adev, address, &hit))
				return -EPERM;
			if (hit)
				return -EACCES;
		} else {
			if (amdgpu_virt_check_vf_critical_region(adev, address, &hit))
				return -EPERM;
			return hit ? -EACCES : 0;
		}
	}

	if ((address >= adev->gmc.mc_vram_size) ||
	    (address >= RAS_UMC_INJECT_ADDR_LIMIT))
		return -EFAULT;

	if (amdgpu_uniras_enabled(adev)) {
		if (amdgpu_sriov_vf(adev))
			count = amdgpu_virt_ras_convert_retired_address(adev, address,
					page_pfns, ARRAY_SIZE(page_pfns));
		else
			count = amdgpu_ras_mgr_lookup_bad_pages_in_a_row(adev, address,
					page_pfns, ARRAY_SIZE(page_pfns));
	} else
		count = amdgpu_umc_lookup_bad_pages_in_a_row(adev,
				address, page_pfns, ARRAY_SIZE(page_pfns));

	if (count <= 0)
		return -EPERM;

	for (i = 0; i < count; i++) {
		memset(&blk_info, 0, sizeof(blk_info));
		ret = amdgpu_vram_mgr_query_address_block_info(&adev->mman.vram_mgr,
				page_pfns[i] << AMDGPU_GPU_PAGE_SHIFT, &blk_info);
		if (!ret) {
			/* The input address that needs to be checked is allocated by
			 * the current calling process, so it is necessary to exclude
			 * the calling process.
			 */
			if ((flags == BYPASS_ALLOCATED_ADDRESS) &&
			    ((blk_info.task.pid != task_pid_nr(current)) ||
			     strncmp(blk_info.task.comm, current->comm, TASK_COMM_LEN)))
				return -EACCES;
			else if ((flags == BYPASS_INITIALIZATION_ADDRESS) &&
				 (blk_info.task.pid == con->init_task_pid) &&
				 !strncmp(blk_info.task.comm, con->init_task_comm, TASK_COMM_LEN))
				return -EACCES;
		}
	}

	return 0;
}
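
/*
 * Usage sketch for the check above (hypothetical address): ask whether an
 * address is backed by a retired page that belongs to another process
 * (flags 0x0, BYPASS_ALLOCATED_ADDRESS) or to the driver's init task
 * (flags 0x1, BYPASS_INITIALIZATION_ADDRESS); a -EACCES result means the
 * address hit such a page.
 *
 *	echo "check_address 0x7ff000 0x0" > /sys/kernel/debug/dri/0/ras/ras_ctrl
 */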

static ssize_t amdgpu_ras_debugfs_read(struct file *f, char __user *buf,
					size_t size, loff_t *pos)
{
	struct ras_manager *obj = (struct ras_manager *)file_inode(f)->i_private;
	struct ras_query_if info = {
		.head = obj->head,
	};
	ssize_t s;
	char val[128];

	if (amdgpu_ras_query_error_status(obj->adev, &info))
		return -EINVAL;

	/* Hardware counter will be reset automatically after the query on Vega20 and Arcturus */
	if (amdgpu_ip_version(obj->adev, MP0_HWIP, 0) != IP_VERSION(11, 0, 2) &&
	    amdgpu_ip_version(obj->adev, MP0_HWIP, 0) != IP_VERSION(11, 0, 4)) {
		if (amdgpu_ras_reset_error_status(obj->adev, info.head.block))
			dev_warn(obj->adev->dev, "Failed to reset error counter and error status");
	}

	s = snprintf(val, sizeof(val), "%s: %lu\n%s: %lu\n",
			"ue", info.ue_count,
			"ce", info.ce_count);
	if (*pos >= s)
		return 0;

	s -= *pos;
	s = min_t(u64, s, size);

	if (copy_to_user(buf, &val[*pos], s))
		return -EINVAL;

	*pos += s;

	return s;
}

static const struct file_operations amdgpu_ras_debugfs_ops = {
	.owner = THIS_MODULE,
	.read = amdgpu_ras_debugfs_read,
	.write = NULL,
	.llseek = default_llseek
};

static int amdgpu_ras_find_block_id_by_name(const char *name, int *block_id)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(ras_block_string); i++) {
		*block_id = i;
		if (strcmp(name, ras_block_string[i]) == 0)
			return 0;
	}
	return -EINVAL;
}

static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f,
		const char __user *buf, size_t size,
		loff_t *pos, struct ras_debug_if *data)
{
	ssize_t s = min_t(u64, 64, size);
	char str[65];
	char block_name[33];
	char err[9] = "ue";
	int op = -1;
	int block_id;
	uint32_t sub_block;
	u64 address, value;
	/* default value is 0 if the mask is not set by user */
	u32 instance_mask = 0;

	if (*pos)
		return -EINVAL;
	*pos = size;

	memset(str, 0, sizeof(str));
	memset(data, 0, sizeof(*data));

	if (copy_from_user(str, buf, s))
		return -EINVAL;

	if (sscanf(str, "disable %32s", block_name) == 1)
		op = 0;
	else if (sscanf(str, "enable %32s %8s", block_name, err) == 2)
		op = 1;
	else if (sscanf(str, "inject %32s %8s", block_name, err) == 2)
		op = 2;
	else if (strstr(str, "retire_page") != NULL)
		op = 3;
	else if (strstr(str, "check_address") != NULL)
		op = 4;
	else if (str[0] && str[1] && str[2] && str[3])
		/* ascii string, but commands are not matched. */
		return -EINVAL;

	if (op != -1) {
		if (op == 3) {
			if (sscanf(str, "%*s 0x%llx", &address) != 1 &&
			    sscanf(str, "%*s %llu", &address) != 1)
				return -EINVAL;

			data->op = op;
			data->inject.address = address;

			return 0;
		} else if (op == 4) {
			if (sscanf(str, "%*s 0x%llx 0x%llx", &address, &value) != 2 &&
			    sscanf(str, "%*s %llu %llu", &address, &value) != 2)
				return -EINVAL;

			data->op = op;
			data->inject.address = address;
			data->inject.value = value;
			return 0;
		}

		if (amdgpu_ras_find_block_id_by_name(block_name, &block_id))
			return -EINVAL;

		data->head.block = block_id;
		/* only ue, ce and poison errors are supported */
		if (!memcmp("ue", err, 2))
			data->head.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
		else if (!memcmp("ce", err, 2))
			data->head.type = AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE;
		else if (!memcmp("poison", err, 6))
			data->head.type = AMDGPU_RAS_ERROR__POISON;
		else
			return -EINVAL;

		data->op = op;

		if (op == 2) {
			if (sscanf(str, "%*s %*s %*s 0x%x 0x%llx 0x%llx 0x%x",
				   &sub_block, &address, &value, &instance_mask) != 4 &&
			    sscanf(str, "%*s %*s %*s %u %llu %llu %u",
				   &sub_block, &address, &value, &instance_mask) != 4 &&
			    sscanf(str, "%*s %*s %*s 0x%x 0x%llx 0x%llx",
				   &sub_block, &address, &value) != 3 &&
			    sscanf(str, "%*s %*s %*s %u %llu %llu",
				   &sub_block, &address, &value) != 3)
				return -EINVAL;
			data->head.sub_block_index = sub_block;
			data->inject.address = address;
			data->inject.value = value;
			data->inject.instance_mask = instance_mask;
		}
	} else {
		if (size < sizeof(*data))
			return -EINVAL;

		if (copy_from_user(data, buf, sizeof(*data)))
			return -EINVAL;
	}

	return 0;
}

static void amdgpu_ras_instance_mask_check(struct amdgpu_device *adev,
				struct ras_debug_if *data)
{
	int num_xcc = adev->gfx.xcc_mask ? NUM_XCC(adev->gfx.xcc_mask) : 1;
	uint32_t mask, inst_mask = data->inject.instance_mask;

	/* no need to set instance mask if there is only one instance */
	if (num_xcc <= 1 && inst_mask) {
		data->inject.instance_mask = 0;
		dev_dbg(adev->dev,
			"RAS inject mask(0x%x) isn't supported and force it to 0.\n",
			inst_mask);

		return;
	}

	switch (data->head.block) {
	case AMDGPU_RAS_BLOCK__GFX:
		mask = GENMASK(num_xcc - 1, 0);
		break;
	case AMDGPU_RAS_BLOCK__SDMA:
		mask = GENMASK(adev->sdma.num_instances - 1, 0);
		break;
	case AMDGPU_RAS_BLOCK__VCN:
	case AMDGPU_RAS_BLOCK__JPEG:
		mask = GENMASK(adev->vcn.num_vcn_inst - 1, 0);
		break;
	default:
		mask = inst_mask;
		break;
	}

	/* remove invalid bits in instance mask */
	data->inject.instance_mask &= mask;
	if (inst_mask != data->inject.instance_mask)
		dev_dbg(adev->dev,
			"Adjust RAS inject mask 0x%x to 0x%x\n",
			inst_mask, data->inject.instance_mask);
}
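
/*
 * Worked example (illustrative numbers): on a GFX block with four XCCs
 * (xcc_mask 0xf), the valid mask is GENMASK(3, 0) == 0xf; a user-supplied
 * instance mask of 0x13 is trimmed to 0x13 & 0xf == 0x3 before injection.
 */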

/**
 * DOC: AMDGPU RAS debugfs control interface
 *
 * The control interface accepts struct ras_debug_if which has two members.
 *
 * First member: ras_debug_if::head or ras_debug_if::inject.
 *
 * head is used to indicate which IP block will be under control.
 *
 * head has four members, they are block, type, sub_block_index, name.
 * block: which IP will be under control.
 * type: what kind of error will be enabled/disabled/injected.
 * sub_block_index: some IPs have subcomponents. say, GFX, SDMA.
 * name: the name of the IP.
 *
 * inject has three more members than head, they are address, value and mask.
 * As their names indicate, inject operation will write the
 * value to the address.
 *
 * The second member: struct ras_debug_if::op.
 * It has three kinds of operations.
 *
 * - 0: disable RAS on the block. Take ::head as its data.
 * - 1: enable RAS on the block. Take ::head as its data.
 * - 2: inject errors on the block. Take ::inject as its data.
 *
 * How to use the interface?
 *
 * In a program
 *
 * Copy the struct ras_debug_if in your code and initialize it.
 * Write the struct to the control interface.
 *
 * From shell
 *
 * .. code-block:: bash
 *
 *	echo "disable <block>" > /sys/kernel/debug/dri/<N>/ras/ras_ctrl
 *	echo "enable <block> <error>" > /sys/kernel/debug/dri/<N>/ras/ras_ctrl
 *	echo "inject <block> <error> <sub-block> <address> <value> <mask>" > /sys/kernel/debug/dri/<N>/ras/ras_ctrl
 *
 * Where N is the card which you want to affect.
 *
 * "disable" requires only the block.
 * "enable" requires the block and error type.
 * "inject" requires the block, error type, address, and value.
 *
 * The block is one of: umc, sdma, gfx, etc.
 *	see ras_block_string[] for details
 *
 * The error type is one of: ue, ce and poison where,
 *	ue is multi-uncorrectable
 *	ce is single-correctable
 *	poison is poison
 *
 * The sub-block is the sub-block index, pass 0 if there is no sub-block.
 * The address and value are hexadecimal numbers, leading 0x is optional.
 * The mask is the instance mask; it is optional and its default value is 0x1.
 *
 * For instance,
 *
 * .. code-block:: bash
 *
 *	echo inject umc ue 0x0 0x0 0x0 > /sys/kernel/debug/dri/0/ras/ras_ctrl
 *	echo inject umc ce 0 0 0 3 > /sys/kernel/debug/dri/0/ras/ras_ctrl
 *	echo disable umc > /sys/kernel/debug/dri/0/ras/ras_ctrl
 *
 * How to check the result of the operation?
 *
 * To check disable/enable, see "ras" features at,
 * /sys/class/drm/card[0/1/2...]/device/ras/features
 *
 * To check inject, see the corresponding error count at,
 * /sys/class/drm/card[0/1/2...]/device/ras/[gfx|sdma|umc|...]_err_count
 *
 * .. note::
 *	Operations are only allowed on blocks which are supported.
 *	Check the "ras" mask at /sys/module/amdgpu/parameters/ras_mask
 *	to see which blocks support RAS on a particular asic.
 *
 */
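
/*
 * A minimal userspace sketch of the "in a program" path documented above
 * (assumes card 0 and debugfs mounted at /sys/kernel/debug); it writes a
 * command string, which the driver parses exactly like the shell echo:
 *
 *	#include <fcntl.h>
 *	#include <string.h>
 *	#include <unistd.h>
 *
 *	int main(void)
 *	{
 *		const char cmd[] = "inject umc ce 0 0 0";
 *		int fd = open("/sys/kernel/debug/dri/0/ras/ras_ctrl", O_WRONLY);
 *
 *		if (fd < 0)
 *			return 1;
 *		if (write(fd, cmd, strlen(cmd)) != (ssize_t)strlen(cmd)) {
 *			close(fd);
 *			return 1;
 *		}
 *		close(fd);
 *		return 0;
 *	}
 */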

static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f,
					     const char __user *buf,
					     size_t size, loff_t *pos)
{
	struct amdgpu_device *adev = (struct amdgpu_device *)file_inode(f)->i_private;
	struct ras_debug_if data;
	int ret = 0;

	if (!amdgpu_ras_get_error_query_ready(adev)) {
		dev_warn(adev->dev, "RAS WARN: error injection "
				"currently inaccessible\n");
		return size;
	}

	ret = amdgpu_ras_debugfs_ctrl_parse_data(f, buf, size, pos, &data);
	if (ret)
		return ret;

	if (data.op == 3) {
		ret = amdgpu_reserve_page_direct(adev, data.inject.address);
		if (!ret)
			return size;
		else
			return ret;
	} else if (data.op == 4) {
		ret = amdgpu_check_address_validity(adev, data.inject.address, data.inject.value);
		return ret ? ret : size;
	}

	if (!amdgpu_ras_is_supported(adev, data.head.block))
		return -EINVAL;

	switch (data.op) {
	case 0:
		ret = amdgpu_ras_feature_enable(adev, &data.head, 0);
		break;
	case 1:
		ret = amdgpu_ras_feature_enable(adev, &data.head, 1);
		break;
	case 2:
		/* umc ce/ue error injection for a bad page is not allowed */
		if (data.head.block == AMDGPU_RAS_BLOCK__UMC)
			ret = amdgpu_ras_check_bad_page(adev, data.inject.address);
		if (ret == -EINVAL) {
			dev_warn(adev->dev, "RAS WARN: input address 0x%llx is invalid.",
				 data.inject.address);
			break;
		} else if (ret == 1) {
			dev_warn(adev->dev, "RAS WARN: inject: 0x%llx has already been marked as bad!\n",
				 data.inject.address);
			break;
		}

		amdgpu_ras_instance_mask_check(adev, &data);

		/* data.inject.address is offset instead of absolute gpu address */
		ret = amdgpu_ras_error_inject(adev, &data.inject);
		break;
	default:
		ret = -EINVAL;
		break;
	}

	if (ret)
		return ret;

	return size;
}

static int amdgpu_uniras_clear_badpages_info(struct amdgpu_device *adev);

/**
 * DOC: AMDGPU RAS debugfs EEPROM table reset interface
 *
 * Some boards contain an EEPROM which is used to persistently store a list of
 * bad pages which experienced ECC errors in vram. This interface provides
 * a way to reset the EEPROM, e.g., after testing error injection.
 *
 * Usage:
 *
 * .. code-block:: bash
 *
 *	echo 1 > ../ras/ras_eeprom_reset
 *
 * will reset EEPROM table to 0 entries.
 *
 */
static ssize_t amdgpu_ras_debugfs_eeprom_write(struct file *f,
					       const char __user *buf,
					       size_t size, loff_t *pos)
{
	struct amdgpu_device *adev =
		(struct amdgpu_device *)file_inode(f)->i_private;
	int ret;

	if (amdgpu_uniras_enabled(adev)) {
		ret = amdgpu_uniras_clear_badpages_info(adev);
		return ret ? ret : size;
	}

	ret = amdgpu_ras_eeprom_reset_table(
		&(amdgpu_ras_get_context(adev)->eeprom_control));

	if (!ret) {
		/* Something was written to EEPROM.
		 */
		amdgpu_ras_get_context(adev)->flags = RAS_DEFAULT_FLAGS;
		return size;
	} else {
		return ret;
	}
}

static const struct file_operations amdgpu_ras_debugfs_ctrl_ops = {
	.owner = THIS_MODULE,
	.read = NULL,
	.write = amdgpu_ras_debugfs_ctrl_write,
	.llseek = default_llseek
};

static const struct file_operations amdgpu_ras_debugfs_eeprom_ops = {
	.owner = THIS_MODULE,
	.read = NULL,
	.write = amdgpu_ras_debugfs_eeprom_write,
	.llseek = default_llseek
};

/**
 * DOC: AMDGPU RAS sysfs Error Count Interface
 *
 * It allows the user to read the error count for each IP block on the gpu through
 * /sys/class/drm/card[0/1/2...]/device/ras/[gfx/sdma/...]_err_count
 *
 * It outputs multiple lines which report the uncorrected (ue) and corrected
 * (ce) error counts.
 *
 * The format of one line is below,
 *
 * [ce|ue]: count
 *
 * Example:
 *
 * .. code-block:: bash
 *
 *	ue: 0
 *	ce: 1
 *
 */
static ssize_t amdgpu_ras_sysfs_read(struct device *dev,
				     struct device_attribute *attr, char *buf)
{
	struct ras_manager *obj = container_of(attr, struct ras_manager, sysfs_attr);
	struct ras_query_if info = {
		.head = obj->head,
	};

	if (!amdgpu_ras_get_error_query_ready(obj->adev))
		return sysfs_emit(buf, "Query currently inaccessible\n");

	if (amdgpu_ras_query_error_status(obj->adev, &info))
		return -EINVAL;

	if (amdgpu_ip_version(obj->adev, MP0_HWIP, 0) != IP_VERSION(11, 0, 2) &&
	    amdgpu_ip_version(obj->adev, MP0_HWIP, 0) != IP_VERSION(11, 0, 4)) {
		if (amdgpu_ras_reset_error_status(obj->adev, info.head.block))
			dev_warn(obj->adev->dev, "Failed to reset error counter and error status");
	}

	if (info.head.block == AMDGPU_RAS_BLOCK__UMC)
		return sysfs_emit(buf, "%s: %lu\n%s: %lu\n%s: %lu\n", "ue", info.ue_count,
				  "ce", info.ce_count, "de", info.de_count);
	else
		return sysfs_emit(buf, "%s: %lu\n%s: %lu\n", "ue", info.ue_count,
				  "ce", info.ce_count);
}
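
/*
 * Reading a counter from the shell (usage sketch; the card index depends
 * on the system):
 *
 *	cat /sys/class/drm/card0/device/ras/umc_err_count
 */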

/* obj begin */

#define get_obj(obj) do { (obj)->use++; } while (0)
#define alive_obj(obj) ((obj)->use)

static inline void put_obj(struct ras_manager *obj)
{
	if (obj && (--obj->use == 0)) {
		list_del(&obj->node);
		amdgpu_ras_error_data_fini(&obj->err_data);
	}

	if (obj && (obj->use < 0))
		DRM_ERROR("RAS ERROR: Unbalance obj(%s) use\n", get_ras_block_str(&obj->head));
}

/* make one obj and return it. */
static struct ras_manager *amdgpu_ras_create_obj(struct amdgpu_device *adev,
		struct ras_common_if *head)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj;

	if (!adev->ras_enabled || !con)
		return NULL;

	if (head->block >= AMDGPU_RAS_BLOCK_COUNT)
		return NULL;

	if (head->block == AMDGPU_RAS_BLOCK__MCA) {
		if (head->sub_block_index >= AMDGPU_RAS_MCA_BLOCK__LAST)
			return NULL;

		obj = &con->objs[AMDGPU_RAS_BLOCK__LAST + head->sub_block_index];
	} else
		obj = &con->objs[head->block];

	/* already exist. return obj? */
	if (alive_obj(obj))
		return NULL;

	if (amdgpu_ras_error_data_init(&obj->err_data))
		return NULL;

	obj->head = *head;
	obj->adev = adev;
	list_add(&obj->node, &con->head);
	get_obj(obj);

	return obj;
}

/* return an obj equal to head, or the first when head is NULL */
struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev,
		struct ras_common_if *head)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj;
	int i;

	if (!adev->ras_enabled || !con)
		return NULL;

	if (head) {
		if (head->block >= AMDGPU_RAS_BLOCK_COUNT)
			return NULL;

		if (head->block == AMDGPU_RAS_BLOCK__MCA) {
			if (head->sub_block_index >= AMDGPU_RAS_MCA_BLOCK__LAST)
				return NULL;

			obj = &con->objs[AMDGPU_RAS_BLOCK__LAST + head->sub_block_index];
		} else
			obj = &con->objs[head->block];

		if (alive_obj(obj))
			return obj;
	} else {
		for (i = 0; i < AMDGPU_RAS_BLOCK_COUNT + AMDGPU_RAS_MCA_BLOCK_COUNT; i++) {
			obj = &con->objs[i];
			if (alive_obj(obj))
				return obj;
		}
	}

	return NULL;
}
/* obj end */

/* feature ctl begin */
static int amdgpu_ras_is_feature_allowed(struct amdgpu_device *adev,
					 struct ras_common_if *head)
{
	return adev->ras_hw_enabled & BIT(head->block);
}

static int amdgpu_ras_is_feature_enabled(struct amdgpu_device *adev,
		struct ras_common_if *head)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

	return con->features & BIT(head->block);
}

/*
 * if obj is not created, then create one.
 * set feature enable flag.
 */
static int __amdgpu_ras_feature_enable(struct amdgpu_device *adev,
		struct ras_common_if *head, int enable)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);

	/* If hardware does not support ras, then do not create obj.
	 * But if hardware supports ras, we can create the obj.
	 * The RAS framework checks con->hw_supported to see if it needs to do
	 * the corresponding initialization.
	 * An IP checks con->support to see if it needs to disable ras.
	 */
	if (!amdgpu_ras_is_feature_allowed(adev, head))
		return 0;

	if (enable) {
		if (!obj) {
			obj = amdgpu_ras_create_obj(adev, head);
			if (!obj)
				return -EINVAL;
		} else {
			/* In case we create obj somewhere else */
			get_obj(obj);
		}
		con->features |= BIT(head->block);
	} else {
		if (obj && amdgpu_ras_is_feature_enabled(adev, head)) {
			con->features &= ~BIT(head->block);
			put_obj(obj);
		}
	}

	return 0;
}

/* wrapper of psp_ras_enable_features */
int amdgpu_ras_feature_enable(struct amdgpu_device *adev,
		struct ras_common_if *head, bool enable)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	union ta_ras_cmd_input *info;
	int ret;

	if (!con)
		return -EINVAL;

	/* For non-gfx ip, do not enable ras feature if it is not allowed */
	/* For gfx ip, regardless of feature support status, */
	/* Force issue enable or disable ras feature commands */
	if (head->block != AMDGPU_RAS_BLOCK__GFX &&
	    !amdgpu_ras_is_feature_allowed(adev, head))
		return 0;

	/* Only enable gfx ras feature from host side */
	if (head->block == AMDGPU_RAS_BLOCK__GFX &&
	    !amdgpu_sriov_vf(adev) &&
	    !amdgpu_ras_intr_triggered()) {
		info = kzalloc_obj(union ta_ras_cmd_input);
		if (!info)
			return -ENOMEM;

		if (!enable) {
			info->disable_features = (struct ta_ras_disable_features_input) {
				.block_id = amdgpu_ras_block_to_ta(head->block),
				.error_type = amdgpu_ras_error_to_ta(head->type),
			};
		} else {
			info->enable_features = (struct ta_ras_enable_features_input) {
				.block_id = amdgpu_ras_block_to_ta(head->block),
				.error_type = amdgpu_ras_error_to_ta(head->type),
			};
		}

		ret = psp_ras_enable_features(&adev->psp, info, enable);
		if (ret) {
			dev_err(adev->dev, "ras %s %s failed poison:%d ret:%d\n",
				enable ? "enable":"disable",
				get_ras_block_str(head),
				amdgpu_ras_is_poison_mode_supported(adev), ret);
			kfree(info);
			return ret;
		}

		kfree(info);
	}

	/* setup the obj */
	__amdgpu_ras_feature_enable(adev, head, enable);

	return 0;
}
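
/*
 * A minimal sketch of a caller (hypothetical values): enable RAS on the
 * GFX block for uncorrectable errors.
 *
 *	struct ras_common_if head = {
 *		.block = AMDGPU_RAS_BLOCK__GFX,
 *		.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE,
 *	};
 *
 *	if (amdgpu_ras_feature_enable(adev, &head, true))
 *		dev_warn(adev->dev, "failed to enable gfx ras feature\n");
 */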

/* Only used in device probe stage and called only once. */
int amdgpu_ras_feature_enable_on_boot(struct amdgpu_device *adev,
		struct ras_common_if *head, bool enable)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	int ret;

	if (!con)
		return -EINVAL;

	if (con->flags & AMDGPU_RAS_FLAG_INIT_BY_VBIOS) {
		if (enable) {
			/* There is no harm to issue a ras TA cmd regardless of
			 * the current ras state.
			 * If current state == target state, it will do nothing
			 * But sometimes it requests driver to reset and repost
			 * with error code -EAGAIN.
			 */
			ret = amdgpu_ras_feature_enable(adev, head, 1);
			/* With old ras TA, we might fail to enable ras.
			 * Log it and just setup the object.
			 * TODO: remove this WA in the future.
			 */
			if (ret == -EINVAL) {
				ret = __amdgpu_ras_feature_enable(adev, head, 1);
				if (!ret)
					dev_info(adev->dev,
						 "RAS INFO: %s setup object\n",
						 get_ras_block_str(head));
			}
		} else {
			/* setup the object then issue a ras TA disable cmd.*/
			ret = __amdgpu_ras_feature_enable(adev, head, 1);
			if (ret)
				return ret;

			/* gfx block ras disable cmd must send to ras-ta */
			if (head->block == AMDGPU_RAS_BLOCK__GFX)
				con->features |= BIT(head->block);

			ret = amdgpu_ras_feature_enable(adev, head, 0);

			/* clean gfx block ras features flag */
			if (adev->ras_enabled && head->block == AMDGPU_RAS_BLOCK__GFX)
				con->features &= ~BIT(head->block);
		}
	} else
		ret = amdgpu_ras_feature_enable(adev, head, enable);

	return ret;
}

static int amdgpu_ras_disable_all_features(struct amdgpu_device *adev,
		bool bypass)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj, *tmp;

	list_for_each_entry_safe(obj, tmp, &con->head, node) {
		/* bypass psp.
		 * aka just release the obj and corresponding flags
		 */
		if (bypass) {
			if (__amdgpu_ras_feature_enable(adev, &obj->head, 0))
				break;
		} else {
			if (amdgpu_ras_feature_enable(adev, &obj->head, 0))
				break;
		}
	}

	return con->features;
}

static int amdgpu_ras_enable_all_features(struct amdgpu_device *adev,
		bool bypass)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	int i;
	const enum amdgpu_ras_error_type default_ras_type = AMDGPU_RAS_ERROR__NONE;

	for (i = 0; i < AMDGPU_RAS_BLOCK_COUNT; i++) {
		struct ras_common_if head = {
			.block = i,
			.type = default_ras_type,
			.sub_block_index = 0,
		};

		if (i == AMDGPU_RAS_BLOCK__MCA)
			continue;

		if (bypass) {
			/*
			 * bypass psp. vbios enable ras for us.
			 * so just create the obj
			 */
			if (__amdgpu_ras_feature_enable(adev, &head, 1))
				break;
		} else {
			if (amdgpu_ras_feature_enable(adev, &head, 1))
				break;
		}
	}

	for (i = 0; i < AMDGPU_RAS_MCA_BLOCK_COUNT; i++) {
		struct ras_common_if head = {
			.block = AMDGPU_RAS_BLOCK__MCA,
			.type = default_ras_type,
			.sub_block_index = i,
		};

		if (bypass) {
			/*
			 * bypass psp. vbios enable ras for us.
			 * so just create the obj
			 */
			if (__amdgpu_ras_feature_enable(adev, &head, 1))
				break;
		} else {
			if (amdgpu_ras_feature_enable(adev, &head, 1))
				break;
		}
	}

	return con->features;
}
/* feature ctl end */

static int amdgpu_ras_block_match_default(struct amdgpu_ras_block_object *block_obj,
		enum amdgpu_ras_block block)
{
	if (!block_obj)
		return -EINVAL;

	if (block_obj->ras_comm.block == block)
		return 0;

	return -EINVAL;
}

static struct amdgpu_ras_block_object *amdgpu_ras_get_ras_block(struct amdgpu_device *adev,
					enum amdgpu_ras_block block, uint32_t sub_block_index)
{
	struct amdgpu_ras_block_list *node, *tmp;
	struct amdgpu_ras_block_object *obj;

	if (block >= AMDGPU_RAS_BLOCK__LAST)
		return NULL;

	list_for_each_entry_safe(node, tmp, &adev->ras_list, node) {
		if (!node->ras_obj) {
			dev_warn(adev->dev, "Warning: abnormal ras list node.\n");
			continue;
		}

		obj = node->ras_obj;
		if (obj->ras_block_match) {
			if (obj->ras_block_match(obj, block, sub_block_index) == 0)
				return obj;
		} else {
			if (amdgpu_ras_block_match_default(obj, block) == 0)
				return obj;
		}
	}

	return NULL;
}

static void amdgpu_ras_get_ecc_info(struct amdgpu_device *adev, struct ras_err_data *err_data)
{
	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
	int ret = 0;

	/*
	 * choosing right query method according to
	 * whether smu support query error information
	 */
	ret = amdgpu_dpm_get_ecc_info(adev, (void *)&(ras->umc_ecc));
	if (ret == -EOPNOTSUPP) {
		if (adev->umc.ras && adev->umc.ras->ras_block.hw_ops &&
		    adev->umc.ras->ras_block.hw_ops->query_ras_error_count)
			adev->umc.ras->ras_block.hw_ops->query_ras_error_count(adev, err_data);

		/* umc query_ras_error_address is also responsible for clearing
		 * error status
		 */
		if (adev->umc.ras && adev->umc.ras->ras_block.hw_ops &&
		    adev->umc.ras->ras_block.hw_ops->query_ras_error_address)
			adev->umc.ras->ras_block.hw_ops->query_ras_error_address(adev, err_data);
	} else if (!ret) {
		if (adev->umc.ras &&
		    adev->umc.ras->ecc_info_query_ras_error_count)
			adev->umc.ras->ecc_info_query_ras_error_count(adev, err_data);

		if (adev->umc.ras &&
		    adev->umc.ras->ecc_info_query_ras_error_address)
			adev->umc.ras->ecc_info_query_ras_error_address(adev, err_data);
	}
}

static void amdgpu_ras_error_print_error_data(struct amdgpu_device *adev,
					      struct ras_manager *ras_mgr,
					      struct ras_err_data *err_data,
					      struct ras_query_context *qctx,
					      const char *blk_name,
					      bool is_ue,
					      bool is_de)
{
	struct amdgpu_smuio_mcm_config_info *mcm_info;
	struct ras_err_node *err_node;
	struct ras_err_info *err_info;
	u64 event_id = qctx->evid.event_id;

	if (is_ue) {
		for_each_ras_error(err_node, err_data) {
			err_info = &err_node->err_info;
			mcm_info = &err_info->mcm_info;
			if (err_info->ue_count) {
				RAS_EVENT_LOG(adev, event_id, "socket: %d, die: %d, "
					      "%lld new uncorrectable hardware errors detected in %s block\n",
					      mcm_info->socket_id,
					      mcm_info->die_id,
					      err_info->ue_count,
					      blk_name);
			}
		}

		for_each_ras_error(err_node, &ras_mgr->err_data) {
			err_info = &err_node->err_info;
			mcm_info = &err_info->mcm_info;
			RAS_EVENT_LOG(adev, event_id, "socket: %d, die: %d, "
				      "%lld uncorrectable hardware errors detected in total in %s block\n",
				      mcm_info->socket_id, mcm_info->die_id, err_info->ue_count, blk_name);
		}

	} else {
		if (is_de) {
			for_each_ras_error(err_node, err_data) {
				err_info = &err_node->err_info;
				mcm_info = &err_info->mcm_info;
				if (err_info->de_count) {
					RAS_EVENT_LOG(adev, event_id, "socket: %d, die: %d, "
						      "%lld new deferred hardware errors detected in %s block\n",
						      mcm_info->socket_id,
						      mcm_info->die_id,
						      err_info->de_count,
						      blk_name);
				}
			}

			for_each_ras_error(err_node, &ras_mgr->err_data) {
				err_info = &err_node->err_info;
				mcm_info = &err_info->mcm_info;
				RAS_EVENT_LOG(adev, event_id, "socket: %d, die: %d, "
					      "%lld deferred hardware errors detected in total in %s block\n",
					      mcm_info->socket_id, mcm_info->die_id,
					      err_info->de_count, blk_name);
			}
		} else {
			if (adev->debug_disable_ce_logs)
				return;

			for_each_ras_error(err_node, err_data) {
				err_info = &err_node->err_info;
				mcm_info = &err_info->mcm_info;
				if (err_info->ce_count) {
					RAS_EVENT_LOG(adev, event_id, "socket: %d, die: %d, "
						      "%lld new correctable hardware errors detected in %s block\n",
						      mcm_info->socket_id,
						      mcm_info->die_id,
						      err_info->ce_count,
						      blk_name);
				}
			}

			for_each_ras_error(err_node, &ras_mgr->err_data) {
				err_info = &err_node->err_info;
				mcm_info = &err_info->mcm_info;
				RAS_EVENT_LOG(adev, event_id, "socket: %d, die: %d, "
					      "%lld correctable hardware errors detected in total in %s block\n",
					      mcm_info->socket_id, mcm_info->die_id,
					      err_info->ce_count, blk_name);
			}
		}
	}
}

static inline bool err_data_has_source_info(struct ras_err_data *data)
{
	return !list_empty(&data->err_node_list);
}

static void amdgpu_ras_error_generate_report(struct amdgpu_device *adev,
					     struct ras_query_if *query_if,
					     struct ras_err_data *err_data,
					     struct ras_query_context *qctx)
{
	struct ras_manager *ras_mgr = amdgpu_ras_find_obj(adev, &query_if->head);
	const char *blk_name = get_ras_block_str(&query_if->head);
	u64 event_id = qctx->evid.event_id;

	if (err_data->ce_count) {
		if (err_data_has_source_info(err_data)) {
			amdgpu_ras_error_print_error_data(adev, ras_mgr, err_data, qctx,
							  blk_name, false, false);
		} else if (!adev->aid_mask &&
			   adev->smuio.funcs &&
			   adev->smuio.funcs->get_socket_id &&
			   adev->smuio.funcs->get_die_id) {
			RAS_EVENT_LOG(adev, event_id, "socket: %d, die: %d "
				      "%ld correctable hardware errors "
				      "detected in %s block\n",
				      adev->smuio.funcs->get_socket_id(adev),
				      adev->smuio.funcs->get_die_id(adev),
				      ras_mgr->err_data.ce_count,
				      blk_name);
		} else {
			RAS_EVENT_LOG(adev, event_id, "%ld correctable hardware errors "
				      "detected in %s block\n",
				      ras_mgr->err_data.ce_count,
				      blk_name);
		}
	}

	if (err_data->ue_count) {
		if (err_data_has_source_info(err_data)) {
			amdgpu_ras_error_print_error_data(adev, ras_mgr, err_data, qctx,
							  blk_name, true, false);
		} else if (!adev->aid_mask &&
			   adev->smuio.funcs &&
			   adev->smuio.funcs->get_socket_id &&
			   adev->smuio.funcs->get_die_id) {
			RAS_EVENT_LOG(adev, event_id, "socket: %d, die: %d "
				      "%ld uncorrectable hardware errors "
				      "detected in %s block\n",
				      adev->smuio.funcs->get_socket_id(adev),
				      adev->smuio.funcs->get_die_id(adev),
				      ras_mgr->err_data.ue_count,
				      blk_name);
		} else {
			RAS_EVENT_LOG(adev, event_id, "%ld uncorrectable hardware errors "
				      "detected in %s block\n",
				      ras_mgr->err_data.ue_count,
				      blk_name);
		}
	}

	if (err_data->de_count) {
		if (err_data_has_source_info(err_data)) {
			amdgpu_ras_error_print_error_data(adev, ras_mgr, err_data, qctx,
							  blk_name, false, true);
		} else if (!adev->aid_mask &&
			   adev->smuio.funcs &&
			   adev->smuio.funcs->get_socket_id &&
			   adev->smuio.funcs->get_die_id) {
			RAS_EVENT_LOG(adev, event_id, "socket: %d, die: %d "
				      "%ld deferred hardware errors "
				      "detected in %s block\n",
				      adev->smuio.funcs->get_socket_id(adev),
				      adev->smuio.funcs->get_die_id(adev),
				      ras_mgr->err_data.de_count,
				      blk_name);
		} else {
			RAS_EVENT_LOG(adev, event_id, "%ld deferred hardware errors "
				      "detected in %s block\n",
				      ras_mgr->err_data.de_count,
				      blk_name);
		}
	}
}

static void amdgpu_ras_virt_error_generate_report(struct amdgpu_device *adev,
						  struct ras_query_if *query_if,
						  struct ras_err_data *err_data,
						  struct ras_query_context *qctx)
{
	unsigned long new_ue, new_ce, new_de;
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &query_if->head);
	const char *blk_name = get_ras_block_str(&query_if->head);
	u64 event_id = qctx->evid.event_id;

	new_ce = err_data->ce_count - obj->err_data.ce_count;
	new_ue = err_data->ue_count - obj->err_data.ue_count;
	new_de = err_data->de_count - obj->err_data.de_count;

	if (new_ce) {
		RAS_EVENT_LOG(adev, event_id, "%lu correctable hardware errors "
			      "detected in %s block\n",
			      new_ce,
			      blk_name);
	}

	if (new_ue) {
		RAS_EVENT_LOG(adev, event_id, "%lu uncorrectable hardware errors "
			      "detected in %s block\n",
			      new_ue,
			      blk_name);
	}

	if (new_de) {
		RAS_EVENT_LOG(adev, event_id, "%lu deferred hardware errors "
			      "detected in %s block\n",
			      new_de,
			      blk_name);
	}
}

static void amdgpu_rasmgr_error_data_statistic_update(struct ras_manager *obj, struct ras_err_data *err_data)
{
	struct ras_err_node *err_node;
	struct ras_err_info *err_info;

	if (err_data_has_source_info(err_data)) {
		for_each_ras_error(err_node, err_data) {
			err_info = &err_node->err_info;
			amdgpu_ras_error_statistic_de_count(&obj->err_data,
					&err_info->mcm_info, err_info->de_count);
			amdgpu_ras_error_statistic_ce_count(&obj->err_data,
					&err_info->mcm_info, err_info->ce_count);
			amdgpu_ras_error_statistic_ue_count(&obj->err_data,
					&err_info->mcm_info, err_info->ue_count);
		}
	} else {
		/* for legacy asic path which doesn't have error source info */
		obj->err_data.ue_count += err_data->ue_count;
		obj->err_data.ce_count += err_data->ce_count;
		obj->err_data.de_count += err_data->de_count;
	}
}

static void amdgpu_ras_mgr_virt_error_data_statistics_update(struct ras_manager *obj,
							     struct ras_err_data *err_data)
{
	/* Host reports absolute counts */
	obj->err_data.ue_count = err_data->ue_count;
	obj->err_data.ce_count = err_data->ce_count;
	obj->err_data.de_count = err_data->de_count;
}
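
/*
 * Worked example of the virtual-function accounting above (illustrative
 * numbers): if the host reports an absolute ce_count of 5 while the VF
 * last recorded 3, amdgpu_ras_virt_error_generate_report() logs 2 new
 * correctable errors, after which the VF copy is overwritten with 5.
 */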

static struct ras_manager *get_ras_manager(struct amdgpu_device *adev, enum amdgpu_ras_block blk)
{
	struct ras_common_if head;

	memset(&head, 0, sizeof(head));
	head.block = blk;

	return amdgpu_ras_find_obj(adev, &head);
}

int amdgpu_ras_bind_aca(struct amdgpu_device *adev, enum amdgpu_ras_block blk,
			const struct aca_info *aca_info, void *data)
{
	struct ras_manager *obj;

	/* in resume phase, no need to create aca fs node */
	if (adev->in_suspend || amdgpu_reset_in_recovery(adev))
		return 0;

	obj = get_ras_manager(adev, blk);
	if (!obj)
		return -EINVAL;

	return amdgpu_aca_add_handle(adev, &obj->aca_handle, ras_block_str(blk), aca_info, data);
}

int amdgpu_ras_unbind_aca(struct amdgpu_device *adev, enum amdgpu_ras_block blk)
{
	struct ras_manager *obj;

	obj = get_ras_manager(adev, blk);
	if (!obj)
		return -EINVAL;

	amdgpu_aca_remove_handle(&obj->aca_handle);

	return 0;
}

static int amdgpu_aca_log_ras_error_data(struct amdgpu_device *adev, enum amdgpu_ras_block blk,
					 enum aca_error_type type, struct ras_err_data *err_data,
					 struct ras_query_context *qctx)
{
	struct ras_manager *obj;

	obj = get_ras_manager(adev, blk);
	if (!obj)
		return -EINVAL;

	return amdgpu_aca_get_error_data(adev, &obj->aca_handle, type, err_data, qctx);
}

ssize_t amdgpu_ras_aca_sysfs_read(struct device *dev, struct device_attribute *attr,
				  struct aca_handle *handle, char *buf, void *data)
{
	struct ras_manager *obj = container_of(handle, struct ras_manager, aca_handle);
	struct ras_query_if info = {
		.head = obj->head,
	};

	if (!amdgpu_ras_get_error_query_ready(obj->adev))
		return sysfs_emit(buf, "Query currently inaccessible\n");

	if (amdgpu_ras_query_error_status(obj->adev, &info))
		return -EINVAL;

	return sysfs_emit(buf, "%s: %lu\n%s: %lu\n%s: %lu\n", "ue", info.ue_count,
			  "ce", info.ce_count, "de", info.de_count);
}

static int amdgpu_ras_query_error_status_helper(struct amdgpu_device *adev,
						struct ras_query_if *info,
						struct ras_err_data *err_data,
						struct ras_query_context *qctx,
						unsigned int error_query_mode)
{
	enum amdgpu_ras_block blk = info ? info->head.block : AMDGPU_RAS_BLOCK_COUNT;
	struct amdgpu_ras_block_object *block_obj = NULL;
	int ret;

	if (blk == AMDGPU_RAS_BLOCK_COUNT)
		return -EINVAL;

	if (error_query_mode == AMDGPU_RAS_INVALID_ERROR_QUERY)
		return -EINVAL;

	if (error_query_mode == AMDGPU_RAS_VIRT_ERROR_COUNT_QUERY) {
		return amdgpu_virt_req_ras_err_count(adev, blk, err_data);
	} else if (error_query_mode == AMDGPU_RAS_DIRECT_ERROR_QUERY) {
		if (info->head.block == AMDGPU_RAS_BLOCK__UMC) {
			amdgpu_ras_get_ecc_info(adev, err_data);
		} else {
			block_obj = amdgpu_ras_get_ras_block(adev, info->head.block, 0);
			if (!block_obj || !block_obj->hw_ops) {
				dev_dbg_once(adev->dev, "%s doesn't config RAS function\n",
					     get_ras_block_str(&info->head));
				return -EINVAL;
			}

			if (block_obj->hw_ops->query_ras_error_count)
				block_obj->hw_ops->query_ras_error_count(adev, err_data);

			if ((info->head.block == AMDGPU_RAS_BLOCK__SDMA) ||
			    (info->head.block == AMDGPU_RAS_BLOCK__GFX) ||
			    (info->head.block == AMDGPU_RAS_BLOCK__MMHUB)) {
				if (block_obj->hw_ops->query_ras_error_status)
					block_obj->hw_ops->query_ras_error_status(adev);
			}
		}
	} else {
		if (amdgpu_aca_is_enabled(adev)) {
			ret = amdgpu_aca_log_ras_error_data(adev, blk, ACA_ERROR_TYPE_UE, err_data, qctx);
			if (ret)
				return ret;

			ret = amdgpu_aca_log_ras_error_data(adev, blk, ACA_ERROR_TYPE_CE, err_data, qctx);
			if (ret)
				return ret;

			ret = amdgpu_aca_log_ras_error_data(adev, blk, ACA_ERROR_TYPE_DEFERRED, err_data, qctx);
			if (ret)
				return ret;
		} else {
			/* FIXME: add code to check return value later */
			amdgpu_mca_smu_log_ras_error(adev, blk, AMDGPU_MCA_ERROR_TYPE_UE, err_data, qctx);
			amdgpu_mca_smu_log_ras_error(adev, blk, AMDGPU_MCA_ERROR_TYPE_CE, err_data, qctx);
		}
	}

	return 0;
}

/* query/inject/cure begin */
static int amdgpu_ras_query_error_status_with_event(struct amdgpu_device *adev,
						    struct ras_query_if *info,
						    enum ras_event_type type)
{
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
	struct ras_err_data err_data;
	struct ras_query_context qctx;
	unsigned int error_query_mode;
	int ret;

	if (!obj)
		return -EINVAL;

	ret = amdgpu_ras_error_data_init(&err_data);
	if (ret)
		return ret;

	if (!amdgpu_ras_get_error_query_mode(adev, &error_query_mode)) {
		ret = -EINVAL;
		goto out_fini_err_data;
	}

	memset(&qctx, 0, sizeof(qctx));
	qctx.evid.type = type;
	qctx.evid.event_id = amdgpu_ras_acquire_event_id(adev, type);

	if (!down_read_trylock(&adev->reset_domain->sem)) {
		ret = -EIO;
		goto out_fini_err_data;
	}

	ret = amdgpu_ras_query_error_status_helper(adev, info,
						   &err_data,
						   &qctx,
						   error_query_mode);
	up_read(&adev->reset_domain->sem);
	if (ret)
		goto out_fini_err_data;

	if (error_query_mode != AMDGPU_RAS_VIRT_ERROR_COUNT_QUERY) {
		amdgpu_rasmgr_error_data_statistic_update(obj, &err_data);
		amdgpu_ras_error_generate_report(adev, info, &err_data, &qctx);
	} else {
		/* Host provides absolute error counts. First generate the report
		 * using the previous VF internal count against the new host count.
		 * Then update the VF internal count.
		 */
		amdgpu_ras_virt_error_generate_report(adev, info, &err_data, &qctx);
		amdgpu_ras_mgr_virt_error_data_statistics_update(obj, &err_data);
	}

	info->ue_count = obj->err_data.ue_count;
	info->ce_count = obj->err_data.ce_count;
	info->de_count = obj->err_data.de_count;

out_fini_err_data:
	amdgpu_ras_error_data_fini(&err_data);

	return ret;
}

static int amdgpu_uniras_clear_badpages_info(struct amdgpu_device *adev)
{
	struct ras_cmd_dev_handle req = {0};
	int ret;

	ret = amdgpu_ras_mgr_handle_ras_cmd(adev, RAS_CMD__CLEAR_BAD_PAGE_INFO,
					    &req, sizeof(req), NULL, 0);
	if (ret) {
		dev_err(adev->dev, "Failed to clear bad pages info, ret: %d\n", ret);
		return ret;
	}

	return 0;
}

static int amdgpu_uniras_query_block_ecc(struct amdgpu_device *adev,
					 struct ras_query_if *info)
{
	struct ras_cmd_block_ecc_info_req req = {0};
	struct ras_cmd_block_ecc_info_rsp rsp = {0};
	int ret;

	if (!info)
		return -EINVAL;

	req.block_id = info->head.block;
	req.subblock_id = info->head.sub_block_index;

	ret = amdgpu_ras_mgr_handle_ras_cmd(adev, RAS_CMD__GET_BLOCK_ECC_STATUS,
					    &req, sizeof(req), &rsp, sizeof(rsp));
	if (!ret) {
		info->ce_count = rsp.ce_count;
		info->ue_count = rsp.ue_count;
		info->de_count = rsp.de_count;
	}

	return ret;
}

int amdgpu_ras_query_error_status(struct amdgpu_device *adev, struct ras_query_if *info)
{
	if (amdgpu_uniras_enabled(adev))
		return amdgpu_uniras_query_block_ecc(adev, info);
	else
		return amdgpu_ras_query_error_status_with_event(adev, info, RAS_EVENT_TYPE_INVALID);
}
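
/*
 * A minimal sketch of a caller (hypothetical use): read the current UMC
 * counters through the common query entry point.
 *
 *	struct ras_query_if info = { 0 };
 *
 *	info.head.block = AMDGPU_RAS_BLOCK__UMC;
 *	if (!amdgpu_ras_query_error_status(adev, &info))
 *		dev_info(adev->dev, "umc ue %lu ce %lu de %lu\n",
 *			 info.ue_count, info.ce_count, info.de_count);
 */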

int amdgpu_ras_reset_error_count(struct amdgpu_device *adev,
		enum amdgpu_ras_block block)
{
	struct amdgpu_ras_block_object *block_obj = amdgpu_ras_get_ras_block(adev, block, 0);
	const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs;
	const struct aca_smu_funcs *smu_funcs = adev->aca.smu_funcs;

	if (!block_obj || !block_obj->hw_ops) {
		dev_dbg_once(adev->dev, "%s doesn't config RAS function\n",
			     ras_block_str(block));
		return -EOPNOTSUPP;
	}

	if (!amdgpu_ras_is_supported(adev, block) ||
	    !amdgpu_ras_get_aca_debug_mode(adev))
		return -EOPNOTSUPP;

	if (amdgpu_sriov_vf(adev))
		return -EOPNOTSUPP;

	/* skip ras error reset in gpu reset */
	if ((amdgpu_in_reset(adev) || amdgpu_ras_in_recovery(adev)) &&
	    ((smu_funcs && smu_funcs->set_debug_mode) ||
	     (mca_funcs && mca_funcs->mca_set_debug_mode)))
		return -EOPNOTSUPP;

	if (block_obj->hw_ops->reset_ras_error_count)
		block_obj->hw_ops->reset_ras_error_count(adev);

	return 0;
}

int amdgpu_ras_reset_error_status(struct amdgpu_device *adev,
		enum amdgpu_ras_block block)
{
	struct amdgpu_ras_block_object *block_obj = amdgpu_ras_get_ras_block(adev, block, 0);

	if (amdgpu_ras_reset_error_count(adev, block) == -EOPNOTSUPP)
		return 0;

	if ((block == AMDGPU_RAS_BLOCK__GFX) ||
	    (block == AMDGPU_RAS_BLOCK__MMHUB)) {
		if (block_obj->hw_ops->reset_ras_error_status)
			block_obj->hw_ops->reset_ras_error_status(adev);
	}

	return 0;
}

static int amdgpu_uniras_error_inject(struct amdgpu_device *adev,
				      struct ras_inject_if *info)
{
	struct ras_cmd_inject_error_req inject_req;
	struct ras_cmd_inject_error_rsp rsp;

	if (!info)
		return -EINVAL;

	memset(&inject_req, 0, sizeof(inject_req));
	inject_req.block_id = info->head.block;
	inject_req.subblock_id = info->head.sub_block_index;
	inject_req.address = info->address;
	inject_req.error_type = info->head.type;
	inject_req.instance_mask = info->instance_mask;
	inject_req.method = info->value;

	return amdgpu_ras_mgr_handle_ras_cmd(adev, RAS_CMD__INJECT_ERROR,
					     &inject_req, sizeof(inject_req), &rsp, sizeof(rsp));
}

/* wrapper of psp_ras_trigger_error */
int amdgpu_ras_error_inject(struct amdgpu_device *adev,
		struct ras_inject_if *info)
{
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
	struct ta_ras_trigger_error_input block_info = {
		.block_id = amdgpu_ras_block_to_ta(info->head.block),
		.inject_error_type = amdgpu_ras_error_to_ta(info->head.type),
		.sub_block_index = info->head.sub_block_index,
		.address = info->address,
		.value = info->value,
	};
	int ret = -EINVAL;
	struct amdgpu_ras_block_object *block_obj = amdgpu_ras_get_ras_block(adev,
							info->head.block,
							info->head.sub_block_index);

	if (amdgpu_uniras_enabled(adev))
		return amdgpu_uniras_error_inject(adev, info);

	/* inject on guest isn't allowed, return success directly */
	if (amdgpu_sriov_vf(adev))
		return 0;

	if (!obj)
		return -EINVAL;

	if (!block_obj || !block_obj->hw_ops) {
		dev_dbg_once(adev->dev, "%s doesn't config RAS function\n",
			     get_ras_block_str(&info->head));
		return -EINVAL;
	}

	/* Calculate XGMI relative offset */
	if (adev->gmc.xgmi.num_physical_nodes > 1 &&
	    info->head.block != AMDGPU_RAS_BLOCK__GFX) {
		block_info.address =
			amdgpu_xgmi_get_relative_phy_addr(adev,
							  block_info.address);
	}

	if (block_obj->hw_ops->ras_error_inject) {
		if (info->head.block == AMDGPU_RAS_BLOCK__GFX)
			ret = block_obj->hw_ops->ras_error_inject(adev, info, info->instance_mask);
		else /* Special ras_error_inject is defined (e.g: xgmi) */
			ret = block_obj->hw_ops->ras_error_inject(adev, &block_info,
								  info->instance_mask);
	} else {
		/* default path */
		ret = psp_ras_trigger_error(&adev->psp, &block_info, info->instance_mask);
	}

	if (ret)
		dev_err(adev->dev, "ras inject %s failed %d\n",
			get_ras_block_str(&info->head), ret);

	return ret;
}

/**
 * amdgpu_ras_query_error_count_helper -- Get error counter for specific IP
 * @adev: pointer to AMD GPU device
 * @ce_count: pointer to an integer to be set to the count of correctable errors.
 * @ue_count: pointer to an integer to be set to the count of uncorrectable errors.
 * @query_info: pointer to ras_query_if
 *
 * Return 0 for query success or do nothing, otherwise return an error
 * on failures
 */
static int amdgpu_ras_query_error_count_helper(struct amdgpu_device *adev,
					       unsigned long *ce_count,
					       unsigned long *ue_count,
					       struct ras_query_if *query_info)
{
	int ret;

	if (!query_info)
		/* do nothing if query_info is not specified */
		return 0;

	ret = amdgpu_ras_query_error_status(adev, query_info);
	if (ret)
		return ret;

	*ce_count += query_info->ce_count;
	*ue_count += query_info->ue_count;

	/* some hardware/IP supports read to clear
	 * no need to explicitly reset the err status after the query call */
	if (amdgpu_ip_version(adev, MP0_HWIP, 0) != IP_VERSION(11, 0, 2) &&
	    amdgpu_ip_version(adev, MP0_HWIP, 0) != IP_VERSION(11, 0, 4)) {
		if (amdgpu_ras_reset_error_status(adev, query_info->head.block))
			dev_warn(adev->dev,
				 "Failed to reset error counter and error status\n");
	}

	return 0;
}

/**
 * amdgpu_ras_query_error_count -- Get error counts of all IPs or specific IP
 * @adev: pointer to AMD GPU device
 * @ce_count: pointer to an integer to be set to the count of correctable errors.
 * @ue_count: pointer to an integer to be set to the count of uncorrectable
 * errors.
 * @query_info: pointer to ras_query_if if the query request is only for
 * specific ip block; if info is NULL, then the query request is for
 * all the ip blocks that support query ras error counters/status
 *
 * If set, @ce_count or @ue_count, count and return the corresponding
 * error counts in those integer pointers. Return 0 if the device
 * supports RAS. Return -EOPNOTSUPP if the device doesn't support RAS.
 */
int amdgpu_ras_query_error_count(struct amdgpu_device *adev,
				 unsigned long *ce_count,
				 unsigned long *ue_count,
				 struct ras_query_if *query_info)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj;
	unsigned long ce, ue;
	int ret;

	if (!adev->ras_enabled || !con)
		return -EOPNOTSUPP;

	/* Don't count since no reporting.
	 */
	if (!ce_count && !ue_count)
		return 0;

	ce = 0;
	ue = 0;
	if (!query_info) {
		/* query all the ip blocks that support ras query interface */
		list_for_each_entry(obj, &con->head, node) {
			struct ras_query_if info = {
				.head = obj->head,
			};

			ret = amdgpu_ras_query_error_count_helper(adev, &ce, &ue, &info);
		}
	} else {
		/* query specific ip block */
		ret = amdgpu_ras_query_error_count_helper(adev, &ce, &ue, query_info);
	}

	if (ret)
		return ret;

	if (ce_count)
		*ce_count = ce;

	if (ue_count)
		*ue_count = ue;

	return 0;
}
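
/*
 * A minimal sketch of a caller (hypothetical use): total the correctable
 * and uncorrectable counts across every registered block.
 *
 *	unsigned long ce = 0, ue = 0;
 *
 *	if (!amdgpu_ras_query_error_count(adev, &ce, &ue, NULL))
 *		dev_info(adev->dev, "total ce %lu ue %lu\n", ce, ue);
 */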
code-block:: bash 1895 * 1896 * 0x00000001 : 0x00001000 : R 1897 * 0x00000002 : 0x00001000 : P 1898 * 1899 */ 1900 1901 static ssize_t amdgpu_ras_sysfs_badpages_read(struct file *f, 1902 struct kobject *kobj, const struct bin_attribute *attr, 1903 char *buf, loff_t ppos, size_t count) 1904 { 1905 struct amdgpu_ras *con = 1906 container_of(attr, struct amdgpu_ras, badpages_attr); 1907 struct amdgpu_device *adev = con->adev; 1908 const unsigned int element_size = 1909 sizeof("0xabcdabcd : 0x12345678 : R\n") - 1; 1910 unsigned int start = div64_ul(ppos + element_size - 1, element_size); 1911 unsigned int end = div64_ul(ppos + count - 1, element_size); 1912 ssize_t s = 0; 1913 struct ras_badpage *bps = NULL; 1914 int bps_count = 0, i, status; 1915 uint64_t address; 1916 1917 memset(buf, 0, count); 1918 1919 bps_count = end - start; 1920 bps = kmalloc_objs(*bps, bps_count); 1921 if (!bps) 1922 return 0; 1923 1924 memset(bps, 0, sizeof(*bps) * bps_count); 1925 1926 if (amdgpu_uniras_enabled(adev)) 1927 bps_count = amdgpu_uniras_badpages_read(adev, bps, bps_count, start); 1928 else 1929 bps_count = amdgpu_ras_badpages_read(adev, bps, bps_count, start); 1930 1931 if (bps_count <= 0) { 1932 kfree(bps); 1933 return 0; 1934 } 1935 1936 for (i = 0; i < bps_count; i++) { 1937 address = ((uint64_t)bps[i].bp) << AMDGPU_GPU_PAGE_SHIFT; 1938 1939 bps[i].size = AMDGPU_GPU_PAGE_SIZE; 1940 1941 status = amdgpu_vram_mgr_query_page_status(&adev->mman.vram_mgr, 1942 address); 1943 if (status == -EBUSY) 1944 bps[i].flags = AMDGPU_RAS_RETIRE_PAGE_PENDING; 1945 else if (status == -ENOENT) 1946 bps[i].flags = AMDGPU_RAS_RETIRE_PAGE_FAULT; 1947 else 1948 bps[i].flags = AMDGPU_RAS_RETIRE_PAGE_RESERVED; 1949 1950 if ((bps[i].flags != AMDGPU_RAS_RETIRE_PAGE_RESERVED) && 1951 amdgpu_ras_check_critical_address(adev, address)) 1952 bps[i].flags = AMDGPU_RAS_RETIRE_PAGE_RESERVED; 1953 1954 s += scnprintf(&buf[s], element_size + 1, 1955 "0x%08x : 0x%08x : %1s\n", 1956 bps[i].bp, 1957 bps[i].size, 1958 amdgpu_ras_badpage_flags_str(bps[i].flags)); 1959 } 1960 1961 kfree(bps); 1962 1963 return s; 1964 } 1965 1966 static ssize_t amdgpu_ras_sysfs_features_read(struct device *dev, 1967 struct device_attribute *attr, char *buf) 1968 { 1969 struct amdgpu_ras *con = 1970 container_of(attr, struct amdgpu_ras, features_attr); 1971 1972 return sysfs_emit(buf, "feature mask: 0x%x\n", con->features); 1973 } 1974 1975 static bool amdgpu_ras_get_version_info(struct amdgpu_device *adev, u32 *major, 1976 u32 *minor, u32 *rev) 1977 { 1978 int i; 1979 1980 if (!adev || !major || !minor || !rev || !amdgpu_uniras_enabled(adev)) 1981 return false; 1982 1983 for (i = 0; i < adev->num_ip_blocks; i++) { 1984 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_RAS) { 1985 *major = adev->ip_blocks[i].version->major; 1986 *minor = adev->ip_blocks[i].version->minor; 1987 *rev = adev->ip_blocks[i].version->rev; 1988 return true; 1989 } 1990 } 1991 1992 return false; 1993 } 1994 1995 static ssize_t amdgpu_ras_sysfs_version_show(struct device *dev, 1996 struct device_attribute *attr, char *buf) 1997 { 1998 struct amdgpu_ras *con = 1999 container_of(attr, struct amdgpu_ras, version_attr); 2000 u32 major, minor, rev; 2001 ssize_t size = 0; 2002 2003 size += sysfs_emit_at(buf, size, "table version: 0x%x\n", 2004 con->eeprom_control.tbl_hdr.version); 2005 2006 if (amdgpu_ras_get_version_info(con->adev, &major, &minor, &rev)) 2007 size += sysfs_emit_at(buf, size, "ras version: %u.%u.%u\n", 2008 major, minor, rev); 2009 2010 return size; 2011 } 2012 2013 
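/**
 * DOC: AMDGPU RAS sysfs features/version/schema read example (illustrative)
 *
 * A minimal usage sketch for the read-only attributes registered in
 * amdgpu_ras_fs_init() below, following the same sysfs layout as the
 * gpu_vram_bad_pages interface above. The card0 index is an assumption
 * and depends on the system:
 *
 * .. code-block:: bash
 *
 *	cat /sys/class/drm/card0/device/ras/features
 *	cat /sys/class/drm/card0/device/ras/version
 *	cat /sys/class/drm/card0/device/ras/schema
 *	cat /sys/class/drm/card0/device/ras/event_state
 */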
static ssize_t amdgpu_ras_sysfs_schema_show(struct device *dev, 2014 struct device_attribute *attr, char *buf) 2015 { 2016 struct amdgpu_ras *con = 2017 container_of(attr, struct amdgpu_ras, schema_attr); 2018 return sysfs_emit(buf, "schema: 0x%x\n", con->schema); 2019 } 2020 2021 static struct { 2022 enum ras_event_type type; 2023 const char *name; 2024 } dump_event[] = { 2025 {RAS_EVENT_TYPE_FATAL, "Fatal Error"}, 2026 {RAS_EVENT_TYPE_POISON_CREATION, "Poison Creation"}, 2027 {RAS_EVENT_TYPE_POISON_CONSUMPTION, "Poison Consumption"}, 2028 }; 2029 2030 static ssize_t amdgpu_ras_sysfs_event_state_show(struct device *dev, 2031 struct device_attribute *attr, char *buf) 2032 { 2033 struct amdgpu_ras *con = 2034 container_of(attr, struct amdgpu_ras, event_state_attr); 2035 struct ras_event_manager *event_mgr = con->event_mgr; 2036 struct ras_event_state *event_state; 2037 int i, size = 0; 2038 2039 if (!event_mgr) 2040 return -EINVAL; 2041 2042 size += sysfs_emit_at(buf, size, "current seqno: %llu\n", atomic64_read(&event_mgr->seqno)); 2043 for (i = 0; i < ARRAY_SIZE(dump_event); i++) { 2044 event_state = &event_mgr->event_state[dump_event[i].type]; 2045 size += sysfs_emit_at(buf, size, "%s: count:%llu, last_seqno:%llu\n", 2046 dump_event[i].name, 2047 atomic64_read(&event_state->count), 2048 event_state->last_seqno); 2049 } 2050 2051 return (ssize_t)size; 2052 } 2053 2054 static void amdgpu_ras_sysfs_remove_bad_page_node(struct amdgpu_device *adev) 2055 { 2056 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 2057 2058 if (adev->dev->kobj.sd) 2059 sysfs_remove_file_from_group(&adev->dev->kobj, 2060 &con->badpages_attr.attr, 2061 RAS_FS_NAME); 2062 } 2063 2064 static int amdgpu_ras_sysfs_remove_dev_attr_node(struct amdgpu_device *adev) 2065 { 2066 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 2067 struct attribute *attrs[] = { 2068 &con->features_attr.attr, 2069 &con->version_attr.attr, 2070 &con->schema_attr.attr, 2071 &con->event_state_attr.attr, 2072 NULL 2073 }; 2074 struct attribute_group group = { 2075 .name = RAS_FS_NAME, 2076 .attrs = attrs, 2077 }; 2078 2079 if (adev->dev->kobj.sd) 2080 sysfs_remove_group(&adev->dev->kobj, &group); 2081 2082 return 0; 2083 } 2084 2085 int amdgpu_ras_sysfs_create(struct amdgpu_device *adev, 2086 struct ras_common_if *head) 2087 { 2088 struct ras_manager *obj = amdgpu_ras_find_obj(adev, head); 2089 2090 if (amdgpu_aca_is_enabled(adev)) 2091 return 0; 2092 2093 if (!obj || obj->attr_inuse) 2094 return -EINVAL; 2095 2096 if (amdgpu_sriov_vf(adev) && !amdgpu_virt_ras_telemetry_block_en(adev, head->block)) 2097 return 0; 2098 2099 get_obj(obj); 2100 2101 snprintf(obj->fs_data.sysfs_name, sizeof(obj->fs_data.sysfs_name), 2102 "%s_err_count", head->name); 2103 2104 obj->sysfs_attr = (struct device_attribute){ 2105 .attr = { 2106 .name = obj->fs_data.sysfs_name, 2107 .mode = S_IRUGO, 2108 }, 2109 .show = amdgpu_ras_sysfs_read, 2110 }; 2111 sysfs_attr_init(&obj->sysfs_attr.attr); 2112 2113 if (sysfs_add_file_to_group(&adev->dev->kobj, 2114 &obj->sysfs_attr.attr, 2115 RAS_FS_NAME)) { 2116 put_obj(obj); 2117 return -EINVAL; 2118 } 2119 2120 obj->attr_inuse = 1; 2121 2122 return 0; 2123 } 2124 2125 int amdgpu_ras_sysfs_remove(struct amdgpu_device *adev, 2126 struct ras_common_if *head) 2127 { 2128 struct ras_manager *obj = amdgpu_ras_find_obj(adev, head); 2129 2130 if (amdgpu_aca_is_enabled(adev)) 2131 return 0; 2132 2133 if (!obj || !obj->attr_inuse) 2134 return -EINVAL; 2135 2136 if (adev->dev->kobj.sd) 2137 
sysfs_remove_file_from_group(&adev->dev->kobj,
2138 &obj->sysfs_attr.attr,
2139 RAS_FS_NAME);
2140 obj->attr_inuse = 0;
2141 put_obj(obj);
2142
2143 return 0;
2144 }
2145
2146 static int amdgpu_ras_sysfs_remove_all(struct amdgpu_device *adev)
2147 {
2148 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
2149 struct ras_manager *obj, *tmp;
2150
2151 list_for_each_entry_safe(obj, tmp, &con->head, node) {
2152 amdgpu_ras_sysfs_remove(adev, &obj->head);
2153 }
2154
2155 if (amdgpu_bad_page_threshold != 0)
2156 amdgpu_ras_sysfs_remove_bad_page_node(adev);
2157
2158 amdgpu_ras_sysfs_remove_dev_attr_node(adev);
2159
2160 return 0;
2161 }
2162 /* sysfs end */
2163
2164 /**
2165 * DOC: AMDGPU RAS Reboot Behavior for Unrecoverable Errors
2166 *
2167 * Normally when there is an uncorrectable error, the driver will reset
2168 * the GPU to recover. However, for an unrecoverable error the driver
2169 * also provides an interface to reboot the system automatically when
2170 * such an error occurs.
2171 *
2172 * The following file in debugfs provides that interface:
2173 * /sys/kernel/debug/dri/[0/1/2...]/ras/auto_reboot
2174 *
2175 * Usage:
2176 *
2177 * .. code-block:: bash
2178 *
2179 * echo true > .../ras/auto_reboot
2180 *
2181 */
2182 /* debugfs begin */
2183 static struct dentry *amdgpu_ras_debugfs_create_ctrl_node(struct amdgpu_device *adev)
2184 {
2185 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
2186 struct amdgpu_ras_eeprom_control *eeprom = &con->eeprom_control;
2187 struct drm_minor *minor = adev_to_drm(adev)->primary;
2188 struct dentry *dir;
2189
2190 dir = debugfs_create_dir(RAS_FS_NAME, minor->debugfs_root);
2191 debugfs_create_file("ras_ctrl", S_IWUGO | S_IRUGO, dir, adev,
2192 &amdgpu_ras_debugfs_ctrl_ops);
2193 debugfs_create_file("ras_eeprom_reset", S_IWUGO | S_IRUGO, dir, adev,
2194 &amdgpu_ras_debugfs_eeprom_ops);
2195 debugfs_create_u32("bad_page_cnt_threshold", 0444, dir,
2196 &con->bad_page_cnt_threshold);
2197 debugfs_create_u32("ras_num_recs", 0444, dir, &eeprom->ras_num_recs);
2198 debugfs_create_x32("ras_hw_enabled", 0444, dir, &adev->ras_hw_enabled);
2199 debugfs_create_x32("ras_enabled", 0444, dir, &adev->ras_enabled);
2200 debugfs_create_file("ras_eeprom_size", S_IRUGO, dir, adev,
2201 &amdgpu_ras_debugfs_eeprom_size_ops);
2202 con->de_ras_eeprom_table = debugfs_create_file("ras_eeprom_table",
2203 S_IRUGO, dir, adev,
2204 &amdgpu_ras_debugfs_eeprom_table_ops);
2205 amdgpu_ras_debugfs_set_ret_size(&con->eeprom_control);
2206
2207 /*
2208 * After an uncorrectable error happens, GPU recovery will usually
2209 * be scheduled. But due to the known problem of GPU recovery failing
2210 * to bring the GPU back, the interface below provides a direct way
2211 * for the user to have the system rebooted automatically when an
2212 * ERREVENT_ATHUB_INTERRUPT is generated. The normal GPU recovery
2213 * routine will never be called in that case.
2214 */
2215 debugfs_create_bool("auto_reboot", S_IWUGO | S_IRUGO, dir, &con->reboot);
2216
2217 /*
2218 * The user can set this so that the hardware error count registers of
2219 * the RAS IPs are not cleaned up during ras recovery.
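 *
 * For example (a sketch; the dri instance number 0 is an assumption,
 * it depends on the system):
 *
 *   echo 1 > /sys/kernel/debug/dri/0/ras/disable_ras_err_cnt_harvest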
2220 */ 2221 debugfs_create_bool("disable_ras_err_cnt_harvest", 0644, dir, 2222 &con->disable_ras_err_cnt_harvest); 2223 return dir; 2224 } 2225 2226 static void amdgpu_ras_debugfs_create(struct amdgpu_device *adev, 2227 struct ras_fs_if *head, 2228 struct dentry *dir) 2229 { 2230 struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head->head); 2231 2232 if (!obj || !dir) 2233 return; 2234 2235 get_obj(obj); 2236 2237 memcpy(obj->fs_data.debugfs_name, 2238 head->debugfs_name, 2239 sizeof(obj->fs_data.debugfs_name)); 2240 2241 debugfs_create_file(obj->fs_data.debugfs_name, S_IWUGO | S_IRUGO, dir, 2242 obj, &amdgpu_ras_debugfs_ops); 2243 } 2244 2245 static bool amdgpu_ras_aca_is_supported(struct amdgpu_device *adev) 2246 { 2247 bool ret; 2248 2249 switch (amdgpu_ip_version(adev, MP0_HWIP, 0)) { 2250 case IP_VERSION(13, 0, 6): 2251 case IP_VERSION(13, 0, 12): 2252 case IP_VERSION(13, 0, 14): 2253 ret = true; 2254 break; 2255 default: 2256 ret = false; 2257 break; 2258 } 2259 2260 return ret; 2261 } 2262 2263 void amdgpu_ras_debugfs_create_all(struct amdgpu_device *adev) 2264 { 2265 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 2266 struct dentry *dir; 2267 struct ras_manager *obj; 2268 struct ras_fs_if fs_info; 2269 2270 /* 2271 * it won't be called in resume path, no need to check 2272 * suspend and gpu reset status 2273 */ 2274 if (!IS_ENABLED(CONFIG_DEBUG_FS) || !con) 2275 return; 2276 2277 dir = amdgpu_ras_debugfs_create_ctrl_node(adev); 2278 2279 list_for_each_entry(obj, &con->head, node) { 2280 if (amdgpu_ras_is_supported(adev, obj->head.block) && 2281 (obj->attr_inuse == 1)) { 2282 sprintf(fs_info.debugfs_name, "%s_err_inject", 2283 get_ras_block_str(&obj->head)); 2284 fs_info.head = obj->head; 2285 amdgpu_ras_debugfs_create(adev, &fs_info, dir); 2286 } 2287 } 2288 2289 if (amdgpu_ras_aca_is_supported(adev)) { 2290 if (amdgpu_aca_is_enabled(adev)) 2291 amdgpu_aca_smu_debugfs_init(adev, dir); 2292 else 2293 amdgpu_mca_smu_debugfs_init(adev, dir); 2294 } 2295 } 2296 2297 /* debugfs end */ 2298 2299 /* ras fs */ 2300 static const BIN_ATTR(gpu_vram_bad_pages, S_IRUGO, 2301 amdgpu_ras_sysfs_badpages_read, NULL, 0); 2302 static DEVICE_ATTR(features, S_IRUGO, 2303 amdgpu_ras_sysfs_features_read, NULL); 2304 static DEVICE_ATTR(version, 0444, 2305 amdgpu_ras_sysfs_version_show, NULL); 2306 static DEVICE_ATTR(schema, 0444, 2307 amdgpu_ras_sysfs_schema_show, NULL); 2308 static DEVICE_ATTR(event_state, 0444, 2309 amdgpu_ras_sysfs_event_state_show, NULL); 2310 static int amdgpu_ras_fs_init(struct amdgpu_device *adev) 2311 { 2312 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 2313 struct attribute_group group = { 2314 .name = RAS_FS_NAME, 2315 }; 2316 struct attribute *attrs[] = { 2317 &con->features_attr.attr, 2318 &con->version_attr.attr, 2319 &con->schema_attr.attr, 2320 &con->event_state_attr.attr, 2321 NULL 2322 }; 2323 const struct bin_attribute *bin_attrs[] = { 2324 NULL, 2325 NULL, 2326 }; 2327 int r; 2328 2329 group.attrs = attrs; 2330 2331 /* add features entry */ 2332 con->features_attr = dev_attr_features; 2333 sysfs_attr_init(attrs[0]); 2334 2335 /* add version entry */ 2336 con->version_attr = dev_attr_version; 2337 sysfs_attr_init(attrs[1]); 2338 2339 /* add schema entry */ 2340 con->schema_attr = dev_attr_schema; 2341 sysfs_attr_init(attrs[2]); 2342 2343 /* add event_state entry */ 2344 con->event_state_attr = dev_attr_event_state; 2345 sysfs_attr_init(attrs[3]); 2346 2347 if (amdgpu_bad_page_threshold != 0) { 2348 /* add bad_page_features entry */ 2349 
con->badpages_attr = bin_attr_gpu_vram_bad_pages;
2350 sysfs_bin_attr_init(&con->badpages_attr);
2351 bin_attrs[0] = &con->badpages_attr;
2352 group.bin_attrs = bin_attrs;
2353 }
2354
2355 r = sysfs_create_group(&adev->dev->kobj, &group);
2356 if (r)
2357 dev_err(adev->dev, "Failed to create RAS sysfs group!");
2358
2359 return 0;
2360 }
2361
2362 static int amdgpu_ras_fs_fini(struct amdgpu_device *adev)
2363 {
2364 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
2365 struct ras_manager *con_obj, *ip_obj, *tmp;
2366
2367 if (IS_ENABLED(CONFIG_DEBUG_FS)) {
2368 list_for_each_entry_safe(con_obj, tmp, &con->head, node) {
2369 ip_obj = amdgpu_ras_find_obj(adev, &con_obj->head);
2370 if (ip_obj)
2371 put_obj(ip_obj);
2372 }
2373 }
2374
2375 amdgpu_ras_sysfs_remove_all(adev);
2376 return 0;
2377 }
2378 /* ras fs end */
2379
2380 /* ih begin */
2381
2382 /* For the hardware that cannot enable bif ring for both ras_controller_irq
2383 * and ras_err_event_athub_irq ih cookies, the driver has to poll the status
2384 * register to check whether the interrupt is triggered or not, and properly
2385 * ack the interrupt if it is there
2386 */
2387 void amdgpu_ras_interrupt_fatal_error_handler(struct amdgpu_device *adev)
2388 {
2389 /* Fatal error events are handled on host side */
2390 if (amdgpu_sriov_vf(adev))
2391 return;
2392 /*
2393 * If the current interrupt is caused by a non-fatal RAS error, skip
2394 * check for fatal error. For fatal errors, FED status of all devices
2395 * in XGMI hive gets set when the first device gets fatal error
2396 * interrupt. The error gets propagated to other devices as well, so
2397 * make sure to ack the interrupt regardless of FED status.
2398 */
2399 if (!amdgpu_ras_get_fed_status(adev) &&
2400 amdgpu_ras_is_err_state(adev, AMDGPU_RAS_BLOCK__ANY))
2401 return;
2402
2403 if (amdgpu_uniras_enabled(adev)) {
2404 amdgpu_ras_mgr_handle_fatal_interrupt(adev, NULL);
2405 return;
2406 }
2407
2408 if (adev->nbio.ras &&
2409 adev->nbio.ras->handle_ras_controller_intr_no_bifring)
2410 adev->nbio.ras->handle_ras_controller_intr_no_bifring(adev);
2411
2412 if (adev->nbio.ras &&
2413 adev->nbio.ras->handle_ras_err_event_athub_intr_no_bifring)
2414 adev->nbio.ras->handle_ras_err_event_athub_intr_no_bifring(adev);
2415 }
2416
2417 static void amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager *obj,
2418 struct amdgpu_iv_entry *entry)
2419 {
2420 bool poison_stat = false;
2421 struct amdgpu_device *adev = obj->adev;
2422 struct amdgpu_ras_block_object *block_obj =
2423 amdgpu_ras_get_ras_block(adev, obj->head.block, 0);
2424 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
2425 enum ras_event_type type = RAS_EVENT_TYPE_POISON_CONSUMPTION;
2426 u64 event_id;
2427 int ret;
2428
2429 if (!block_obj || !con)
2430 return;
2431
2432 ret = amdgpu_ras_mark_ras_event(adev, type);
2433 if (ret)
2434 return;
2435
2436 amdgpu_ras_set_err_poison(adev, block_obj->ras_comm.block);
2437 /* both query_poison_status and handle_poison_consumption are optional,
2438 * but at least one of them should be implemented if we need a poison
2439 * consumption handler
2440 */
2441 if (block_obj->hw_ops && block_obj->hw_ops->query_poison_status) {
2442 poison_stat = block_obj->hw_ops->query_poison_status(adev);
2443 if (!poison_stat) {
2444 /* Not poison consumption interrupt, no need to handle it */
2445 dev_info(adev->dev, "No RAS poison status in %s poison IH.\n",
2446 block_obj->ras_comm.name);
2447
2448 return;
2449 }
2450 }
2451
2452 amdgpu_umc_poison_handler(adev,
obj->head.block, 0);
2453
2454 if (block_obj->hw_ops && block_obj->hw_ops->handle_poison_consumption)
2455 poison_stat = block_obj->hw_ops->handle_poison_consumption(adev);
2456
2457 /* gpu reset is the fallback for failed and default cases.
2458 * For the RMA case, amdgpu_umc_poison_handler will handle gpu reset.
2459 */
2460 if (poison_stat && !amdgpu_ras_is_rma(adev)) {
2461 event_id = amdgpu_ras_acquire_event_id(adev, type);
2462 RAS_EVENT_LOG(adev, event_id,
2463 "GPU reset for %s RAS poison consumption is issued!\n",
2464 block_obj->ras_comm.name);
2465 amdgpu_ras_reset_gpu(adev);
2466 }
2467
2468 if (!poison_stat)
2469 amdgpu_gfx_poison_consumption_handler(adev, entry);
2470 }
2471
2472 static void amdgpu_ras_interrupt_poison_creation_handler(struct ras_manager *obj,
2473 struct amdgpu_iv_entry *entry)
2474 {
2475 struct amdgpu_device *adev = obj->adev;
2476 enum ras_event_type type = RAS_EVENT_TYPE_POISON_CREATION;
2477 u64 event_id;
2478 int ret;
2479
2480 ret = amdgpu_ras_mark_ras_event(adev, type);
2481 if (ret)
2482 return;
2483
2484 event_id = amdgpu_ras_acquire_event_id(adev, type);
2485 RAS_EVENT_LOG(adev, event_id, "Poison is created\n");
2486
2487 if (amdgpu_ip_version(obj->adev, UMC_HWIP, 0) >= IP_VERSION(12, 0, 0)) {
2488 struct amdgpu_ras *con = amdgpu_ras_get_context(obj->adev);
2489
2490 atomic_inc(&con->page_retirement_req_cnt);
2491 atomic_inc(&con->poison_creation_count);
2492
2493 wake_up(&con->page_retirement_wq);
2494 }
2495 }
2496
2497 static void amdgpu_ras_interrupt_umc_handler(struct ras_manager *obj,
2498 struct amdgpu_iv_entry *entry)
2499 {
2500 struct ras_ih_data *data = &obj->ih_data;
2501 struct ras_err_data err_data;
2502 int ret;
2503
2504 if (!data->cb)
2505 return;
2506
2507 ret = amdgpu_ras_error_data_init(&err_data);
2508 if (ret)
2509 return;
2510
2511 /* Let IP handle its data, maybe we need to get the output
2512 * from the callback to update the error type/count, etc
2513 */
2514 amdgpu_ras_set_fed(obj->adev, true);
2515 ret = data->cb(obj->adev, &err_data, entry);
2516 /* A UE will trigger an interrupt, and in that case
2517 * we need to do a reset to recover the whole system.
2518 * But leave the recovery to the IP; here we just dispatch
2519 * the error.
2520 */ 2521 if (ret == AMDGPU_RAS_SUCCESS) { 2522 /* these counts could be left as 0 if 2523 * some blocks do not count error number 2524 */ 2525 obj->err_data.ue_count += err_data.ue_count; 2526 obj->err_data.ce_count += err_data.ce_count; 2527 obj->err_data.de_count += err_data.de_count; 2528 } 2529 2530 amdgpu_ras_error_data_fini(&err_data); 2531 } 2532 2533 static void amdgpu_ras_interrupt_handler(struct ras_manager *obj) 2534 { 2535 struct ras_ih_data *data = &obj->ih_data; 2536 struct amdgpu_iv_entry entry; 2537 2538 while (data->rptr != data->wptr) { 2539 rmb(); 2540 memcpy(&entry, &data->ring[data->rptr], 2541 data->element_size); 2542 2543 wmb(); 2544 data->rptr = (data->aligned_element_size + 2545 data->rptr) % data->ring_size; 2546 2547 if (amdgpu_ras_is_poison_mode_supported(obj->adev)) { 2548 if (obj->head.block == AMDGPU_RAS_BLOCK__UMC) 2549 amdgpu_ras_interrupt_poison_creation_handler(obj, &entry); 2550 else 2551 amdgpu_ras_interrupt_poison_consumption_handler(obj, &entry); 2552 } else { 2553 if (obj->head.block == AMDGPU_RAS_BLOCK__UMC) 2554 amdgpu_ras_interrupt_umc_handler(obj, &entry); 2555 else 2556 dev_warn(obj->adev->dev, 2557 "No RAS interrupt handler for non-UMC block with poison disabled.\n"); 2558 } 2559 } 2560 } 2561 2562 static void amdgpu_ras_interrupt_process_handler(struct work_struct *work) 2563 { 2564 struct ras_ih_data *data = 2565 container_of(work, struct ras_ih_data, ih_work); 2566 struct ras_manager *obj = 2567 container_of(data, struct ras_manager, ih_data); 2568 2569 amdgpu_ras_interrupt_handler(obj); 2570 } 2571 2572 int amdgpu_ras_interrupt_dispatch(struct amdgpu_device *adev, 2573 struct ras_dispatch_if *info) 2574 { 2575 struct ras_manager *obj; 2576 struct ras_ih_data *data; 2577 2578 if (amdgpu_uniras_enabled(adev)) { 2579 struct ras_ih_info ih_info; 2580 2581 memset(&ih_info, 0, sizeof(ih_info)); 2582 ih_info.block = info->head.block; 2583 memcpy(&ih_info.iv_entry, info->entry, sizeof(struct amdgpu_iv_entry)); 2584 2585 return amdgpu_ras_mgr_handle_controller_interrupt(adev, &ih_info); 2586 } 2587 2588 obj = amdgpu_ras_find_obj(adev, &info->head); 2589 if (!obj) 2590 return -EINVAL; 2591 2592 data = &obj->ih_data; 2593 2594 if (data->inuse == 0) 2595 return 0; 2596 2597 /* Might be overflow... 
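 * The ring holds 64 aligned entries (see amdgpu_ras_interrupt_add_handler)
 * and wptr below advances without being checked against rptr, so a burst
 * of more than 64 unhandled entries would overwrite ones the worker has
 * not consumed yet.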
*/
2598 memcpy(&data->ring[data->wptr], info->entry,
2599 data->element_size);
2600
2601 wmb();
2602 data->wptr = (data->aligned_element_size +
2603 data->wptr) % data->ring_size;
2604
2605 schedule_work(&data->ih_work);
2606
2607 return 0;
2608 }
2609
2610 int amdgpu_ras_interrupt_remove_handler(struct amdgpu_device *adev,
2611 struct ras_common_if *head)
2612 {
2613 struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);
2614 struct ras_ih_data *data;
2615
2616 if (!obj)
2617 return -EINVAL;
2618
2619 data = &obj->ih_data;
2620 if (data->inuse == 0)
2621 return 0;
2622
2623 cancel_work_sync(&data->ih_work);
2624
2625 kfree(data->ring);
2626 memset(data, 0, sizeof(*data));
2627 put_obj(obj);
2628
2629 return 0;
2630 }
2631
2632 int amdgpu_ras_interrupt_add_handler(struct amdgpu_device *adev,
2633 struct ras_common_if *head)
2634 {
2635 struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);
2636 struct ras_ih_data *data;
2637 struct amdgpu_ras_block_object *ras_obj;
2638
2639 if (!obj) {
2640 /* in case we register the IH before enabling the ras feature */
2641 obj = amdgpu_ras_create_obj(adev, head);
2642 if (!obj)
2643 return -EINVAL;
2644 } else
2645 get_obj(obj);
2646
2647 ras_obj = container_of(head, struct amdgpu_ras_block_object, ras_comm);
2648
2649 data = &obj->ih_data;
2650 /* add the callback, etc. */
2651 *data = (struct ras_ih_data) {
2652 .inuse = 0,
2653 .cb = ras_obj->ras_cb,
2654 .element_size = sizeof(struct amdgpu_iv_entry),
2655 .rptr = 0,
2656 .wptr = 0,
2657 };
2658
2659 INIT_WORK(&data->ih_work, amdgpu_ras_interrupt_process_handler);
2660
2661 data->aligned_element_size = ALIGN(data->element_size, 8);
2662 /* the ring can store 64 iv entries. */
2663 data->ring_size = 64 * data->aligned_element_size;
2664 data->ring = kmalloc(data->ring_size, GFP_KERNEL);
2665 if (!data->ring) {
2666 put_obj(obj);
2667 return -ENOMEM;
2668 }
2669
2670 /* IH is ready */
2671 data->inuse = 1;
2672
2673 return 0;
2674 }
2675
2676 static int amdgpu_ras_interrupt_remove_all(struct amdgpu_device *adev)
2677 {
2678 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
2679 struct ras_manager *obj, *tmp;
2680
2681 list_for_each_entry_safe(obj, tmp, &con->head, node) {
2682 amdgpu_ras_interrupt_remove_handler(adev, &obj->head);
2683 }
2684
2685 return 0;
2686 }
2687 /* ih end */
2688
2689 /* traverse all IPs except NBIO to query error counters */
2690 static void amdgpu_ras_log_on_err_counter(struct amdgpu_device *adev, enum ras_event_type type)
2691 {
2692 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
2693 struct ras_manager *obj;
2694
2695 if (!adev->ras_enabled || !con)
2696 return;
2697
2698 list_for_each_entry(obj, &con->head, node) {
2699 struct ras_query_if info = {
2700 .head = obj->head,
2701 };
2702
2703 /*
2704 * The PCIE_BIF IP has a separate isr for the ras controller
2705 * interrupt, and the specific ras counter query is done in
2706 * that isr, so skip such a block in the common sync flood
2707 * interrupt isr.
2708 */
2709 if (info.head.block == AMDGPU_RAS_BLOCK__PCIE_BIF)
2710 continue;
2711
2712 /*
2713 * this is a workaround for aldebaran: skip sending the msg to
2714 * smu to get the ecc_info table, because smu temporarily fails
2715 * to handle it.
2716 * It should be removed once smu fixes the ecc_info table handling.
2717 */
2718 if ((info.head.block == AMDGPU_RAS_BLOCK__UMC) &&
2719 (amdgpu_ip_version(adev, MP1_HWIP, 0) ==
2720 IP_VERSION(13, 0, 2)))
2721 continue;
2722
2723 amdgpu_ras_query_error_status_with_event(adev, &info, type);
2724
2725 if (amdgpu_ip_version(adev, MP0_HWIP, 0) !=
2726 IP_VERSION(11, 0, 2) &&
2727 amdgpu_ip_version(adev, MP0_HWIP, 0) !=
2728 IP_VERSION(11, 0, 4) &&
2729 amdgpu_ip_version(adev, MP0_HWIP, 0) !=
2730 IP_VERSION(13, 0, 0)) {
2731 if (amdgpu_ras_reset_error_status(adev, info.head.block))
2732 dev_warn(adev->dev, "Failed to reset error counter and error status");
2733 }
2734 }
2735 }
2736
2737 /* Parse RdRspStatus and WrRspStatus */
2738 static void amdgpu_ras_error_status_query(struct amdgpu_device *adev,
2739 struct ras_query_if *info)
2740 {
2741 struct amdgpu_ras_block_object *block_obj;
2742 /*
2743 * Only two blocks need to query read/write
2744 * RspStatus at the current state
2745 */
2746 if ((info->head.block != AMDGPU_RAS_BLOCK__GFX) &&
2747 (info->head.block != AMDGPU_RAS_BLOCK__MMHUB))
2748 return;
2749
2750 block_obj = amdgpu_ras_get_ras_block(adev,
2751 info->head.block,
2752 info->head.sub_block_index);
2753
2754 if (!block_obj || !block_obj->hw_ops) {
2755 dev_dbg_once(adev->dev, "%s doesn't config RAS function\n",
2756 get_ras_block_str(&info->head));
2757 return;
2758 }
2759
2760 if (block_obj->hw_ops->query_ras_error_status)
2761 block_obj->hw_ops->query_ras_error_status(adev);
2762
2763 }
2764
2765 static void amdgpu_ras_query_err_status(struct amdgpu_device *adev)
2766 {
2767 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
2768 struct ras_manager *obj;
2769
2770 if (!adev->ras_enabled || !con)
2771 return;
2772
2773 list_for_each_entry(obj, &con->head, node) {
2774 struct ras_query_if info = {
2775 .head = obj->head,
2776 };
2777
2778 amdgpu_ras_error_status_query(adev, &info);
2779 }
2780 }
2781
2782 static int amdgpu_ras_badpages_read(struct amdgpu_device *adev,
2783 struct ras_badpage *bps, uint32_t count, uint32_t start)
2784 {
2785 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
2786 struct ras_err_handler_data *data;
2787 int r = 0;
2788 uint32_t i;
2789
2790 if (!con || !con->eh_data || !bps || !count)
2791 return -EINVAL;
2792
2793 mutex_lock(&con->recovery_lock);
2794 data = con->eh_data;
2795 if (start < data->count) {
2796 for (i = start; i < data->count; i++) {
2797 if (!data->bps[i].ts)
2798 continue;
2799
2800 /* U64_MAX is used to mark the record as invalid */
2801 if (data->bps[i].retired_page == U64_MAX)
2802 continue;
2803
2804 bps[r].bp = data->bps[i].retired_page;
2805 r++;
2806 if (r >= count)
2807 break;
2808 }
2809 }
2810 mutex_unlock(&con->recovery_lock);
2811
2812 return r;
2813 }
2814
2815 static int amdgpu_uniras_badpages_read(struct amdgpu_device *adev,
2816 struct ras_badpage *bps, uint32_t count, uint32_t start)
2817 {
2818 struct ras_cmd_bad_pages_info_req cmd_input;
2819 struct ras_cmd_bad_pages_info_rsp *output;
2820 uint32_t group, start_group, end_group;
2821 uint32_t pos, pos_in_group;
2822 int r = 0, i;
2823
2824 if (!bps || !count)
2825 return -EINVAL;
2826
2827 output = kmalloc_obj(*output);
2828 if (!output)
2829 return -ENOMEM;
2830
2831 memset(&cmd_input, 0, sizeof(cmd_input));
2832
2833 start_group = start / RAS_CMD_MAX_BAD_PAGES_PER_GROUP;
2834 end_group = (start + count + RAS_CMD_MAX_BAD_PAGES_PER_GROUP - 1) /
2835 RAS_CMD_MAX_BAD_PAGES_PER_GROUP;
2836
2837 pos = start;
2838 for (group = start_group; group < end_group; group++) {
2839 memset(output, 0, sizeof(*output));
2840
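		/* each RAS_CMD__GET_BAD_PAGES request returns one group of at
		 * most RAS_CMD_MAX_BAD_PAGES_PER_GROUP records; walk the
		 * groups until the window [start, start + count) is filled
		 */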
cmd_input.group_index = group; 2841 if (amdgpu_ras_mgr_handle_ras_cmd(adev, RAS_CMD__GET_BAD_PAGES, 2842 &cmd_input, sizeof(cmd_input), output, sizeof(*output))) 2843 goto out; 2844 2845 if (pos >= output->bp_total_cnt) 2846 goto out; 2847 2848 pos_in_group = pos - group * RAS_CMD_MAX_BAD_PAGES_PER_GROUP; 2849 for (i = pos_in_group; i < output->bp_in_group; i++, pos++) { 2850 if (!output->records[i].ts) 2851 continue; 2852 2853 bps[r].bp = output->records[i].retired_page; 2854 r++; 2855 if (r >= count) 2856 goto out; 2857 } 2858 } 2859 2860 out: 2861 kfree(output); 2862 return r; 2863 } 2864 2865 static void amdgpu_ras_set_fed_all(struct amdgpu_device *adev, 2866 struct amdgpu_hive_info *hive, bool status) 2867 { 2868 struct amdgpu_device *tmp_adev; 2869 2870 if (hive) { 2871 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) 2872 amdgpu_ras_set_fed(tmp_adev, status); 2873 } else { 2874 amdgpu_ras_set_fed(adev, status); 2875 } 2876 } 2877 2878 bool amdgpu_ras_in_recovery(struct amdgpu_device *adev) 2879 { 2880 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 2881 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 2882 int hive_ras_recovery = 0; 2883 2884 if (hive) { 2885 hive_ras_recovery = atomic_read(&hive->ras_recovery); 2886 amdgpu_put_xgmi_hive(hive); 2887 } 2888 2889 if (ras && (atomic_read(&ras->in_recovery) || hive_ras_recovery)) 2890 return true; 2891 2892 return false; 2893 } 2894 2895 static enum ras_event_type amdgpu_ras_get_fatal_error_event(struct amdgpu_device *adev) 2896 { 2897 if (amdgpu_ras_intr_triggered()) 2898 return RAS_EVENT_TYPE_FATAL; 2899 else 2900 return RAS_EVENT_TYPE_POISON_CONSUMPTION; 2901 } 2902 2903 static void amdgpu_ras_do_recovery(struct work_struct *work) 2904 { 2905 struct amdgpu_ras *ras = 2906 container_of(work, struct amdgpu_ras, recovery_work); 2907 struct amdgpu_device *remote_adev = NULL; 2908 struct amdgpu_device *adev = ras->adev; 2909 struct list_head device_list, *device_list_handle = NULL; 2910 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 2911 unsigned int error_query_mode; 2912 enum ras_event_type type; 2913 2914 if (hive) { 2915 atomic_set(&hive->ras_recovery, 1); 2916 2917 /* If any device which is part of the hive received RAS fatal 2918 * error interrupt, set fatal error status on all. This 2919 * condition will need a recovery, and flag will be cleared 2920 * as part of recovery. 
2921 */ 2922 list_for_each_entry(remote_adev, &hive->device_list, 2923 gmc.xgmi.head) 2924 if (amdgpu_ras_get_fed_status(remote_adev)) { 2925 amdgpu_ras_set_fed_all(adev, hive, true); 2926 break; 2927 } 2928 } 2929 if (!ras->disable_ras_err_cnt_harvest) { 2930 2931 /* Build list of devices to query RAS related errors */ 2932 if (hive && adev->gmc.xgmi.num_physical_nodes > 1) { 2933 device_list_handle = &hive->device_list; 2934 } else { 2935 INIT_LIST_HEAD(&device_list); 2936 list_add_tail(&adev->gmc.xgmi.head, &device_list); 2937 device_list_handle = &device_list; 2938 } 2939 2940 if (amdgpu_ras_get_error_query_mode(adev, &error_query_mode)) { 2941 if (error_query_mode == AMDGPU_RAS_FIRMWARE_ERROR_QUERY) { 2942 /* wait 500ms to ensure pmfw polling mca bank info done */ 2943 msleep(500); 2944 } 2945 } 2946 2947 type = amdgpu_ras_get_fatal_error_event(adev); 2948 list_for_each_entry(remote_adev, 2949 device_list_handle, gmc.xgmi.head) { 2950 if (amdgpu_uniras_enabled(remote_adev)) { 2951 amdgpu_ras_mgr_update_ras_ecc(remote_adev); 2952 } else { 2953 amdgpu_ras_query_err_status(remote_adev); 2954 amdgpu_ras_log_on_err_counter(remote_adev, type); 2955 } 2956 } 2957 2958 } 2959 2960 if (amdgpu_device_should_recover_gpu(ras->adev)) { 2961 struct amdgpu_reset_context reset_context; 2962 memset(&reset_context, 0, sizeof(reset_context)); 2963 2964 reset_context.method = AMD_RESET_METHOD_NONE; 2965 reset_context.reset_req_dev = adev; 2966 reset_context.src = AMDGPU_RESET_SRC_RAS; 2967 set_bit(AMDGPU_SKIP_COREDUMP, &reset_context.flags); 2968 2969 /* Perform full reset in fatal error mode */ 2970 if (!amdgpu_ras_is_poison_mode_supported(ras->adev)) 2971 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); 2972 else { 2973 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); 2974 2975 if (ras->gpu_reset_flags & AMDGPU_RAS_GPU_RESET_MODE2_RESET) { 2976 ras->gpu_reset_flags &= ~AMDGPU_RAS_GPU_RESET_MODE2_RESET; 2977 reset_context.method = AMD_RESET_METHOD_MODE2; 2978 } 2979 2980 /* Fatal error occurs in poison mode, mode1 reset is used to 2981 * recover gpu. 
2982 */
2983 if (ras->gpu_reset_flags & AMDGPU_RAS_GPU_RESET_MODE1_RESET) {
2984 ras->gpu_reset_flags &= ~AMDGPU_RAS_GPU_RESET_MODE1_RESET;
2985 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
2986
2987 psp_fatal_error_recovery_quirk(&adev->psp);
2988 }
2989 }
2990
2991 amdgpu_device_gpu_recover(ras->adev, NULL, &reset_context);
2992 }
2993 atomic_set(&ras->in_recovery, 0);
2994 if (hive) {
2995 atomic_set(&hive->ras_recovery, 0);
2996 amdgpu_put_xgmi_hive(hive);
2997 }
2998 }
2999
3000 /* alloc/realloc bps array */
3001 static int amdgpu_ras_realloc_eh_data_space(struct amdgpu_device *adev,
3002 struct ras_err_handler_data *data, int pages)
3003 {
3004 unsigned int old_space = data->count + data->space_left;
3005 unsigned int new_space = old_space + pages;
3006 unsigned int align_space = ALIGN(new_space, 512);
3007 void *bps = kmalloc_objs(*data->bps, align_space);
3008
3009 if (!bps) {
3010 return -ENOMEM;
3011 }
3012
3013 if (data->bps) {
3014 memcpy(bps, data->bps,
3015 data->count * sizeof(*data->bps));
3016 kfree(data->bps);
3017 }
3018
3019 data->bps = bps;
3020 data->space_left += align_space - old_space;
3021 return 0;
3022 }
3023
3024 static int amdgpu_ras_mca2pa_by_idx(struct amdgpu_device *adev,
3025 struct eeprom_table_record *bps,
3026 struct ras_err_data *err_data)
3027 {
3028 struct ta_ras_query_address_input addr_in;
3029 uint32_t socket = 0;
3030 int ret = 0;
3031
3032 if (adev->smuio.funcs && adev->smuio.funcs->get_socket_id)
3033 socket = adev->smuio.funcs->get_socket_id(adev);
3034
3035 /* reinit err_data */
3036 err_data->err_addr_cnt = 0;
3037 err_data->err_addr_len = adev->umc.retire_unit;
3038
3039 memset(&addr_in, 0, sizeof(addr_in));
3040 addr_in.ma.err_addr = bps->address;
3041 addr_in.ma.socket_id = socket;
3042 addr_in.ma.ch_inst = bps->mem_channel;
3043 if (!amdgpu_ras_smu_eeprom_supported(adev)) {
3044 /* tell RAS TA the node instance is not used */
3045 addr_in.ma.node_inst = TA_RAS_INV_NODE;
3046 } else {
3047 addr_in.ma.umc_inst = bps->mcumc_id;
3048 addr_in.ma.node_inst = bps->cu;
3049 }
3050
3051 if (adev->umc.ras && adev->umc.ras->convert_ras_err_addr)
3052 ret = adev->umc.ras->convert_ras_err_addr(adev, err_data,
3053 &addr_in, NULL, false);
3054
3055 return ret;
3056 }
3057
3058 static int amdgpu_ras_mca2pa(struct amdgpu_device *adev,
3059 struct eeprom_table_record *bps,
3060 struct ras_err_data *err_data)
3061 {
3062 struct ta_ras_query_address_input addr_in;
3063 uint32_t die_id, socket = 0;
3064
3065 if (adev->smuio.funcs && adev->smuio.funcs->get_socket_id)
3066 socket = adev->smuio.funcs->get_socket_id(adev);
3067
3068 /* although the die id is obtained from the PA in nps1 mode, the id is
3069 * suitable for any nps mode
3070 */
3071 if (adev->umc.ras && adev->umc.ras->get_die_id_from_pa)
3072 die_id = adev->umc.ras->get_die_id_from_pa(adev, bps->address,
3073 bps->retired_page << AMDGPU_GPU_PAGE_SHIFT);
3074 else
3075 return -EINVAL;
3076
3077 /* reinit err_data */
3078 err_data->err_addr_cnt = 0;
3079 err_data->err_addr_len = adev->umc.retire_unit;
3080
3081 memset(&addr_in, 0, sizeof(addr_in));
3082 addr_in.ma.err_addr = bps->address;
3083 addr_in.ma.ch_inst = bps->mem_channel;
3084 addr_in.ma.umc_inst = bps->mcumc_id;
3085 addr_in.ma.node_inst = die_id;
3086 addr_in.ma.socket_id = socket;
3087
3088 if (adev->umc.ras && adev->umc.ras->convert_ras_err_addr)
3089 return adev->umc.ras->convert_ras_err_addr(adev, err_data,
3090 &addr_in, NULL, false);
3091 else
3092 return -EINVAL;
3093 }
3094
3095 static int __amdgpu_ras_restore_bad_pages(struct
amdgpu_device *adev, 3096 struct eeprom_table_record *bps, int count) 3097 { 3098 int j; 3099 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 3100 struct ras_err_handler_data *data = con->eh_data; 3101 3102 for (j = 0; j < count; j++) { 3103 if (!data->space_left && 3104 amdgpu_ras_realloc_eh_data_space(adev, data, 256)) { 3105 return -ENOMEM; 3106 } 3107 3108 if (amdgpu_ras_check_bad_page_unlock(con, 3109 bps[j].retired_page << AMDGPU_GPU_PAGE_SHIFT)) { 3110 /* set to U64_MAX to mark it as invalid */ 3111 data->bps[data->count].retired_page = U64_MAX; 3112 data->count++; 3113 data->space_left--; 3114 continue; 3115 } 3116 3117 amdgpu_ras_reserve_page(adev, bps[j].retired_page); 3118 3119 memcpy(&data->bps[data->count], &(bps[j]), 3120 sizeof(struct eeprom_table_record)); 3121 data->count++; 3122 data->space_left--; 3123 con->bad_page_num++; 3124 } 3125 3126 return 0; 3127 } 3128 3129 static int __amdgpu_ras_convert_rec_array_from_rom(struct amdgpu_device *adev, 3130 struct eeprom_table_record *bps, struct ras_err_data *err_data, 3131 enum amdgpu_memory_partition nps) 3132 { 3133 int i = 0; 3134 uint64_t chan_idx_v2; 3135 enum amdgpu_memory_partition save_nps; 3136 3137 save_nps = (bps[0].retired_page >> UMC_NPS_SHIFT) & UMC_NPS_MASK; 3138 chan_idx_v2 = bps[0].retired_page & UMC_CHANNEL_IDX_V2; 3139 3140 /*old asics just have pa in eeprom*/ 3141 if (IP_VERSION_MAJ(amdgpu_ip_version(adev, UMC_HWIP, 0)) < 12) { 3142 memcpy(err_data->err_addr, bps, 3143 sizeof(struct eeprom_table_record) * adev->umc.retire_unit); 3144 goto out; 3145 } 3146 3147 for (i = 0; i < adev->umc.retire_unit; i++) 3148 bps[i].retired_page &= ~(UMC_NPS_MASK << UMC_NPS_SHIFT); 3149 3150 if (save_nps || chan_idx_v2) { 3151 if (save_nps == nps) { 3152 if (amdgpu_umc_pages_in_a_row(adev, err_data, 3153 bps[0].retired_page << AMDGPU_GPU_PAGE_SHIFT)) 3154 return -EINVAL; 3155 for (i = 0; i < adev->umc.retire_unit; i++) { 3156 err_data->err_addr[i].address = bps[0].address; 3157 err_data->err_addr[i].mem_channel = bps[0].mem_channel; 3158 err_data->err_addr[i].bank = bps[0].bank; 3159 err_data->err_addr[i].err_type = bps[0].err_type; 3160 err_data->err_addr[i].mcumc_id = bps[0].mcumc_id; 3161 } 3162 } else { 3163 if (amdgpu_ras_mca2pa_by_idx(adev, &bps[0], err_data)) 3164 return -EINVAL; 3165 } 3166 } else { 3167 if (bps[0].address == 0) { 3168 /* for specific old eeprom data, mca address is not stored, 3169 * calc it from pa 3170 */ 3171 if (amdgpu_umc_pa2mca(adev, bps[0].retired_page << AMDGPU_GPU_PAGE_SHIFT, 3172 &(bps[0].address), AMDGPU_NPS1_PARTITION_MODE)) 3173 return -EINVAL; 3174 } 3175 3176 if (amdgpu_ras_mca2pa(adev, &bps[0], err_data)) { 3177 if (nps == AMDGPU_NPS1_PARTITION_MODE) 3178 memcpy(err_data->err_addr, bps, 3179 sizeof(struct eeprom_table_record) * adev->umc.retire_unit); 3180 else 3181 return -EOPNOTSUPP; 3182 } 3183 } 3184 3185 out: 3186 return __amdgpu_ras_restore_bad_pages(adev, err_data->err_addr, adev->umc.retire_unit); 3187 } 3188 3189 static int __amdgpu_ras_convert_rec_from_rom(struct amdgpu_device *adev, 3190 struct eeprom_table_record *bps, struct ras_err_data *err_data, 3191 enum amdgpu_memory_partition nps) 3192 { 3193 int i = 0; 3194 uint64_t chan_idx_v2; 3195 enum amdgpu_memory_partition save_nps; 3196 3197 if (!amdgpu_ras_smu_eeprom_supported(adev)) { 3198 save_nps = (bps->retired_page >> UMC_NPS_SHIFT) & UMC_NPS_MASK; 3199 chan_idx_v2 = bps->retired_page & UMC_CHANNEL_IDX_V2; 3200 bps->retired_page &= ~(UMC_NPS_MASK << UMC_NPS_SHIFT); 3201 } else { 3202 /* if pmfw manages eeprom, 
save_nps is not stored in eeprom,
3203 * so we should always convert the mca address into a physical address;
3204 * make save_nps different from nps
3205 */
3206 save_nps = nps + 1;
3207 }
3208
3209 if (save_nps == nps) {
3210 if (amdgpu_umc_pages_in_a_row(adev, err_data,
3211 bps->retired_page << AMDGPU_GPU_PAGE_SHIFT))
3212 return -EINVAL;
3213 for (i = 0; i < adev->umc.retire_unit; i++) {
3214 err_data->err_addr[i].address = bps->address;
3215 err_data->err_addr[i].mem_channel = bps->mem_channel;
3216 err_data->err_addr[i].bank = bps->bank;
3217 err_data->err_addr[i].err_type = bps->err_type;
3218 err_data->err_addr[i].mcumc_id = bps->mcumc_id;
3219 }
3220 } else {
3221 if (save_nps || chan_idx_v2) {
3222 if (amdgpu_ras_mca2pa_by_idx(adev, bps, err_data))
3223 return -EINVAL;
3224 } else {
3225 /* for specific old eeprom data, the mca address is not stored,
3226 * calc it from the pa
3227 */
3228 if (bps->address == 0)
3229 if (amdgpu_umc_pa2mca(adev,
3230 bps->retired_page << AMDGPU_GPU_PAGE_SHIFT,
3231 &(bps->address),
3232 AMDGPU_NPS1_PARTITION_MODE))
3233 return -EINVAL;
3234
3235 if (amdgpu_ras_mca2pa(adev, bps, err_data))
3236 return -EOPNOTSUPP;
3237 }
3238 }
3239
3240 return __amdgpu_ras_restore_bad_pages(adev, err_data->err_addr,
3241 adev->umc.retire_unit);
3242 }
3243
3244 /* it deals with vram only. */
3245 int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
3246 struct eeprom_table_record *bps, int pages, bool from_rom)
3247 {
3248 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
3249 struct ras_err_data err_data;
3250 struct amdgpu_ras_eeprom_control *control =
3251 &adev->psp.ras_context.ras->eeprom_control;
3252 enum amdgpu_memory_partition nps = AMDGPU_NPS1_PARTITION_MODE;
3253 int ret = 0;
3254 uint32_t i = 0;
3255
3256 if (!con || !con->eh_data || !bps || pages <= 0)
3257 return 0;
3258
3259 if (from_rom) {
3260 err_data.err_addr =
3261 kzalloc_objs(struct eeprom_table_record,
3262 adev->umc.retire_unit);
3263 if (!err_data.err_addr) {
3264 dev_warn(adev->dev, "Failed to alloc UMC error address record in mca2pa conversion!\n");
3265 return -ENOMEM;
3266 }
3267
3268 if (adev->gmc.gmc_funcs->query_mem_partition_mode)
3269 nps = adev->gmc.gmc_funcs->query_mem_partition_mode(adev);
3270 }
3271
3272 mutex_lock(&con->recovery_lock);
3273
3274 if (from_rom) {
3275 /* there are no pa recs in V3, so skip pa recs processing */
3276 if ((control->tbl_hdr.version < RAS_TABLE_VER_V3) &&
3277 !amdgpu_ras_smu_eeprom_supported(adev)) {
3278 for (i = 0; i < pages; i++) {
3279 if (control->ras_num_recs - i >= adev->umc.retire_unit) {
3280 if ((bps[i].address == bps[i + 1].address) &&
3281 (bps[i].mem_channel == bps[i + 1].mem_channel)) {
3282 /* deal with retire_unit records at a time */
3283 ret = __amdgpu_ras_convert_rec_array_from_rom(adev,
3284 &bps[i], &err_data, nps);
3285 i += (adev->umc.retire_unit - 1);
3286 } else {
3287 break;
3288 }
3289 } else {
3290 break;
3291 }
3292 }
3293 }
3294 for (; i < pages; i++) {
3295 ret = __amdgpu_ras_convert_rec_from_rom(adev,
3296 &bps[i], &err_data, nps);
3297 }
3298
3299 con->eh_data->count_saved = con->eh_data->count;
3300 } else {
3301 ret = __amdgpu_ras_restore_bad_pages(adev, bps, pages);
3302 }
3303
3304 if (from_rom)
3305 kfree(err_data.err_addr);
3306 mutex_unlock(&con->recovery_lock);
3307
3308 return ret;
3309 }
3310
3311 /*
3312 * write error record array to eeprom, the function should be
3313 * protected by recovery_lock
3314 * new_cnt: newly added UE count, excluding reserved bad pages, can be NULL
3315 */
3316 int
amdgpu_ras_save_bad_pages(struct amdgpu_device *adev,
3317 unsigned long *new_cnt)
3318 {
3319 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
3320 struct ras_err_handler_data *data;
3321 struct amdgpu_ras_eeprom_control *control;
3322 int save_count, unit_num, i;
3323
3324 if (!con || !con->eh_data) {
3325 if (new_cnt)
3326 *new_cnt = 0;
3327
3328 return 0;
3329 }
3330
3331 if (!con->eeprom_control.is_eeprom_valid) {
3332 dev_warn(adev->dev,
3333 "Failed to save EEPROM table data because of EEPROM data corruption!");
3334 if (new_cnt)
3335 *new_cnt = 0;
3336
3337 return 0;
3338 }
3339
3340 mutex_lock(&con->recovery_lock);
3341 control = &con->eeprom_control;
3342 data = con->eh_data;
3343 if (amdgpu_ras_smu_eeprom_supported(adev))
3344 unit_num = control->ras_num_recs -
3345 control->ras_num_recs_old;
3346 else
3347 unit_num = data->count / adev->umc.retire_unit -
3348 control->ras_num_recs;
3349
3350 save_count = con->bad_page_num - control->ras_num_bad_pages;
3351 mutex_unlock(&con->recovery_lock);
3352
3353 if (new_cnt)
3354 *new_cnt = unit_num;
3355
3356 /* only new entries are saved */
3357 if (unit_num && save_count) {
3358 /* old asics only save pa to eeprom like before */
3359 if (IP_VERSION_MAJ(amdgpu_ip_version(adev, UMC_HWIP, 0)) < 12) {
3360 if (amdgpu_ras_eeprom_append(control,
3361 &data->bps[data->count_saved], unit_num)) {
3362 dev_err(adev->dev, "Failed to save EEPROM table data!");
3363 return -EIO;
3364 }
3365 } else {
3366 for (i = 0; i < unit_num; i++) {
3367 if (amdgpu_ras_eeprom_append(control,
3368 &data->bps[data->count_saved +
3369 i * adev->umc.retire_unit], 1)) {
3370 dev_err(adev->dev, "Failed to save EEPROM table data!");
3371 return -EIO;
3372 }
3373 }
3374 }
3375
3376 dev_info(adev->dev, "Saved %d pages to EEPROM table.\n", save_count);
3377 data->count_saved = data->count;
3378 }
3379
3380 return 0;
3381 }
3382
3383 /*
3384 * read error record array in eeprom and reserve enough space for
3385 * storing new bad pages
3386 */
3387 static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev)
3388 {
3389 struct amdgpu_ras_eeprom_control *control =
3390 &adev->psp.ras_context.ras->eeprom_control;
3391 struct eeprom_table_record *bps;
3392 int ret, i = 0;
3393
3394 /* no bad page record, skip eeprom access */
3395 if (control->ras_num_recs == 0 || amdgpu_bad_page_threshold == 0)
3396 return 0;
3397
3398 bps = kzalloc_objs(*bps, control->ras_num_recs);
3399 if (!bps)
3400 return -ENOMEM;
3401
3402 ret = amdgpu_ras_eeprom_read(control, bps, control->ras_num_recs);
3403 if (ret) {
3404 dev_err(adev->dev, "Failed to load EEPROM table records!");
3405 } else {
3406 if (adev->umc.ras && adev->umc.ras->convert_ras_err_addr) {
3407 /* In V3 there are no pa recs, and in some cases (when address == 0)
3408 * records may be parsed as pa recs, so add a version check to avoid that.
3409 */
3410 if ((control->tbl_hdr.version < RAS_TABLE_VER_V3) &&
3411 !amdgpu_ras_smu_eeprom_supported(adev)) {
3412 for (i = 0; i < control->ras_num_recs; i++) {
3413 if ((control->ras_num_recs - i) >= adev->umc.retire_unit) {
3414 if ((bps[i].address == bps[i + 1].address) &&
3415 (bps[i].mem_channel == bps[i + 1].mem_channel)) {
3416 control->ras_num_pa_recs += adev->umc.retire_unit;
3417 i += (adev->umc.retire_unit - 1);
3418 } else {
3419 control->ras_num_mca_recs +=
3420 (control->ras_num_recs - i);
3421 break;
3422 }
3423 } else {
3424 control->ras_num_mca_recs += (control->ras_num_recs - i);
3425 break;
3426 }
3427 }
3428 } else {
3429 control->ras_num_mca_recs = control->ras_num_recs;
3430 }
3431 }
3432
3433 ret = amdgpu_ras_add_bad_pages(adev, bps, control->ras_num_recs, true);
3434 if (ret)
3435 goto out;
3436
3437 ret = amdgpu_ras_eeprom_check(control);
3438 if (ret)
3439 goto out;
3440
3441 /* HW not usable */
3442 if (amdgpu_ras_is_rma(adev))
3443 ret = -EHWPOISON;
3444 }
3445
3446 out:
3447 kfree(bps);
3448 return ret;
3449 }
3450
3451 static int amdgpu_ras_check_bad_page_unlock(struct amdgpu_ras *con,
3452 uint64_t addr)
3453 {
3454 struct ras_err_handler_data *data = con->eh_data;
3455 struct amdgpu_device *adev = con->adev;
3456 int i;
3457
3458 if ((addr >= adev->gmc.mc_vram_size &&
3459 adev->gmc.mc_vram_size) ||
3460 (addr >= RAS_UMC_INJECT_ADDR_LIMIT))
3461 return -EINVAL;
3462
3463 addr >>= AMDGPU_GPU_PAGE_SHIFT;
3464 for (i = 0; i < data->count; i++)
3465 if (addr == data->bps[i].retired_page)
3466 return 1;
3467
3468 return 0;
3469 }
3470
3471 /*
3472 * check if an address belongs to bad page
3473 *
3474 * Note: this check is only for umc block
3475 */
3476 static int amdgpu_ras_check_bad_page(struct amdgpu_device *adev,
3477 uint64_t addr)
3478 {
3479 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
3480 int ret = 0;
3481
3482 if (!con || !con->eh_data)
3483 return ret;
3484
3485 mutex_lock(&con->recovery_lock);
3486 ret = amdgpu_ras_check_bad_page_unlock(con, addr);
3487 mutex_unlock(&con->recovery_lock);
3488 return ret;
3489 }
3490
3491 static void amdgpu_ras_validate_threshold(struct amdgpu_device *adev,
3492 uint32_t max_count)
3493 {
3494 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
3495
3496 /*
3497 * amdgpu_bad_page_threshold is used to config
3498 * the threshold for the number of bad pages.
3499 * -1: Threshold is set to default value
3500 * Driver will issue a warning message when threshold is reached
3501 * and continue runtime services.
3502 * 0: Disable bad page retirement
3503 * Driver will not retire bad pages
3504 * which is intended for debugging purposes.
3505 * -2: Threshold is determined by a formula
3506 * that assumes 1 bad page per 100M of local memory.
3507 * Driver will continue runtime services when threshold is reached.
3508 * 0 < threshold < max number of bad page records in EEPROM:
3509 * A user-defined threshold is set.
3510 * Driver will halt runtime services when this custom threshold is reached.
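 * For example (an illustrative calculation, not taken from the code):
 * with 64 GiB of VRAM, the -2 formula yields a threshold of
 * 64 GiB / 100 MiB = 655 bad pages, since do_div() truncates.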
3511 */ 3512 if (amdgpu_bad_page_threshold == -2) { 3513 u64 val = adev->gmc.mc_vram_size; 3514 3515 do_div(val, RAS_BAD_PAGE_COVER); 3516 con->bad_page_cnt_threshold = min(lower_32_bits(val), 3517 max_count); 3518 } else if (amdgpu_bad_page_threshold == -1) { 3519 con->bad_page_cnt_threshold = ((con->reserved_pages_in_bytes) >> 21) << 4; 3520 } else { 3521 con->bad_page_cnt_threshold = min_t(int, max_count, 3522 amdgpu_bad_page_threshold); 3523 } 3524 } 3525 3526 int amdgpu_ras_put_poison_req(struct amdgpu_device *adev, 3527 enum amdgpu_ras_block block, uint16_t pasid, 3528 pasid_notify pasid_fn, void *data, uint32_t reset) 3529 { 3530 int ret = 0; 3531 struct ras_poison_msg poison_msg; 3532 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 3533 3534 memset(&poison_msg, 0, sizeof(poison_msg)); 3535 poison_msg.block = block; 3536 poison_msg.pasid = pasid; 3537 poison_msg.reset = reset; 3538 poison_msg.pasid_fn = pasid_fn; 3539 poison_msg.data = data; 3540 3541 ret = kfifo_put(&con->poison_fifo, poison_msg); 3542 if (!ret) { 3543 dev_err(adev->dev, "Poison message fifo is full!\n"); 3544 return -ENOSPC; 3545 } 3546 3547 return 0; 3548 } 3549 3550 static int amdgpu_ras_get_poison_req(struct amdgpu_device *adev, 3551 struct ras_poison_msg *poison_msg) 3552 { 3553 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 3554 3555 return kfifo_get(&con->poison_fifo, poison_msg); 3556 } 3557 3558 static void amdgpu_ras_ecc_log_init(struct ras_ecc_log_info *ecc_log) 3559 { 3560 mutex_init(&ecc_log->lock); 3561 3562 INIT_RADIX_TREE(&ecc_log->de_page_tree, GFP_KERNEL); 3563 ecc_log->de_queried_count = 0; 3564 ecc_log->consumption_q_count = 0; 3565 } 3566 3567 static void amdgpu_ras_ecc_log_fini(struct ras_ecc_log_info *ecc_log) 3568 { 3569 struct radix_tree_iter iter; 3570 void __rcu **slot; 3571 struct ras_ecc_err *ecc_err; 3572 3573 mutex_lock(&ecc_log->lock); 3574 radix_tree_for_each_slot(slot, &ecc_log->de_page_tree, &iter, 0) { 3575 ecc_err = radix_tree_deref_slot(slot); 3576 kfree(ecc_err->err_pages.pfn); 3577 kfree(ecc_err); 3578 radix_tree_iter_delete(&ecc_log->de_page_tree, &iter, slot); 3579 } 3580 mutex_unlock(&ecc_log->lock); 3581 3582 mutex_destroy(&ecc_log->lock); 3583 ecc_log->de_queried_count = 0; 3584 ecc_log->consumption_q_count = 0; 3585 } 3586 3587 static bool amdgpu_ras_schedule_retirement_dwork(struct amdgpu_ras *con, 3588 uint32_t delayed_ms) 3589 { 3590 int ret; 3591 3592 mutex_lock(&con->umc_ecc_log.lock); 3593 ret = radix_tree_tagged(&con->umc_ecc_log.de_page_tree, 3594 UMC_ECC_NEW_DETECTED_TAG); 3595 mutex_unlock(&con->umc_ecc_log.lock); 3596 3597 if (ret) 3598 schedule_delayed_work(&con->page_retirement_dwork, 3599 msecs_to_jiffies(delayed_ms)); 3600 3601 return ret ? 
true : false;
3602 }
3603
3604 static void amdgpu_ras_do_page_retirement(struct work_struct *work)
3605 {
3606 struct amdgpu_ras *con = container_of(work, struct amdgpu_ras,
3607 page_retirement_dwork.work);
3608 struct amdgpu_device *adev = con->adev;
3609 struct ras_err_data err_data;
3610
3611 /* If gpu reset is ongoing, delay retiring the bad pages */
3612 if (amdgpu_in_reset(adev) || amdgpu_ras_in_recovery(adev)) {
3613 amdgpu_ras_schedule_retirement_dwork(con,
3614 AMDGPU_RAS_RETIRE_PAGE_INTERVAL * 3);
3615 return;
3616 }
3617
3618 amdgpu_ras_error_data_init(&err_data);
3619
3620 amdgpu_umc_handle_bad_pages(adev, &err_data);
3621
3622 amdgpu_ras_error_data_fini(&err_data);
3623
3624 amdgpu_ras_schedule_retirement_dwork(con,
3625 AMDGPU_RAS_RETIRE_PAGE_INTERVAL);
3626 }
3627
3628 static int amdgpu_ras_poison_creation_handler(struct amdgpu_device *adev,
3629 uint32_t poison_creation_count)
3630 {
3631 int ret = 0;
3632 struct ras_ecc_log_info *ecc_log;
3633 struct ras_query_if info;
3634 u32 timeout = MAX_UMC_POISON_POLLING_TIME_ASYNC;
3635 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
3636 u64 de_queried_count;
3637 u64 consumption_q_count;
3638 enum ras_event_type type = RAS_EVENT_TYPE_POISON_CREATION;
3639
3640 memset(&info, 0, sizeof(info));
3641 info.head.block = AMDGPU_RAS_BLOCK__UMC;
3642
3643 ecc_log = &ras->umc_ecc_log;
3644 ecc_log->de_queried_count = 0;
3645 ecc_log->consumption_q_count = 0;
3646
3647 do {
3648 ret = amdgpu_ras_query_error_status_with_event(adev, &info, type);
3649 if (ret)
3650 return ret;
3651
3652 de_queried_count = ecc_log->de_queried_count;
3653 consumption_q_count = ecc_log->consumption_q_count;
3654
3655 if (de_queried_count && consumption_q_count)
3656 break;
3657
3658 msleep(100);
3659 } while (--timeout);
3660
3661 if (de_queried_count)
3662 schedule_delayed_work(&ras->page_retirement_dwork, 0);
3663
3664 if (amdgpu_ras_is_rma(adev) && atomic_cmpxchg(&ras->rma_in_recovery, 0, 1) == 0)
3665 amdgpu_ras_reset_gpu(adev);
3666
3667 return 0;
3668 }
3669
3670 static void amdgpu_ras_clear_poison_fifo(struct amdgpu_device *adev)
3671 {
3672 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
3673 struct ras_poison_msg msg;
3674 int ret;
3675
3676 do {
3677 ret = kfifo_get(&con->poison_fifo, &msg);
3678 } while (ret);
3679 }
3680
3681 static int amdgpu_ras_poison_consumption_handler(struct amdgpu_device *adev,
3682 uint32_t msg_count, uint32_t *gpu_reset)
3683 {
3684 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
3685 uint32_t reset_flags = 0, reset = 0;
3686 struct ras_poison_msg msg;
3687 int ret, i;
3688
3689 kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
3690
3691 for (i = 0; i < msg_count; i++) {
3692 ret = amdgpu_ras_get_poison_req(adev, &msg);
3693 if (!ret)
3694 continue;
3695
3696 if (msg.pasid_fn)
3697 msg.pasid_fn(adev, msg.pasid, msg.data);
3698
3699 reset_flags |= msg.reset;
3700 }
3701
3702 /*
3703 * Try to ensure the poison creation handler completes first,
3704 * so that rma is set if bad pages exceed the threshold.
3705 */ 3706 flush_delayed_work(&con->page_retirement_dwork); 3707 3708 /* for RMA, amdgpu_ras_poison_creation_handler will trigger gpu reset */ 3709 if (reset_flags && !amdgpu_ras_is_rma(adev)) { 3710 if (reset_flags & AMDGPU_RAS_GPU_RESET_MODE1_RESET) 3711 reset = AMDGPU_RAS_GPU_RESET_MODE1_RESET; 3712 else if (reset_flags & AMDGPU_RAS_GPU_RESET_MODE2_RESET) 3713 reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET; 3714 else 3715 reset = reset_flags; 3716 3717 con->gpu_reset_flags |= reset; 3718 amdgpu_ras_reset_gpu(adev); 3719 3720 *gpu_reset = reset; 3721 3722 /* Wait for gpu recovery to complete */ 3723 flush_work(&con->recovery_work); 3724 } 3725 3726 return 0; 3727 } 3728 3729 static int amdgpu_ras_page_retirement_thread(void *param) 3730 { 3731 struct amdgpu_device *adev = (struct amdgpu_device *)param; 3732 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 3733 uint32_t poison_creation_count, msg_count; 3734 uint32_t gpu_reset; 3735 int ret; 3736 3737 while (!kthread_should_stop()) { 3738 3739 wait_event_interruptible(con->page_retirement_wq, 3740 kthread_should_stop() || 3741 atomic_read(&con->page_retirement_req_cnt)); 3742 3743 if (kthread_should_stop()) 3744 break; 3745 3746 mutex_lock(&con->poison_lock); 3747 gpu_reset = 0; 3748 3749 do { 3750 poison_creation_count = atomic_read(&con->poison_creation_count); 3751 ret = amdgpu_ras_poison_creation_handler(adev, poison_creation_count); 3752 if (ret == -EIO) 3753 break; 3754 3755 if (poison_creation_count) { 3756 atomic_sub(poison_creation_count, &con->poison_creation_count); 3757 atomic_sub(poison_creation_count, &con->page_retirement_req_cnt); 3758 } 3759 } while (atomic_read(&con->poison_creation_count) && 3760 !atomic_read(&con->poison_consumption_count)); 3761 3762 if (ret != -EIO) { 3763 msg_count = kfifo_len(&con->poison_fifo); 3764 if (msg_count) { 3765 ret = amdgpu_ras_poison_consumption_handler(adev, 3766 msg_count, &gpu_reset); 3767 if ((ret != -EIO) && 3768 (gpu_reset != AMDGPU_RAS_GPU_RESET_MODE1_RESET)) 3769 atomic_sub(msg_count, &con->page_retirement_req_cnt); 3770 } 3771 } 3772 3773 if ((ret == -EIO) || (gpu_reset == AMDGPU_RAS_GPU_RESET_MODE1_RESET)) { 3774 /* gpu mode-1 reset is ongoing or just completed ras mode-1 reset */ 3775 /* Clear poison creation request */ 3776 atomic_set(&con->poison_creation_count, 0); 3777 atomic_set(&con->poison_consumption_count, 0); 3778 3779 /* Clear poison fifo */ 3780 amdgpu_ras_clear_poison_fifo(adev); 3781 3782 /* Clear all poison requests */ 3783 atomic_set(&con->page_retirement_req_cnt, 0); 3784 3785 if (ret == -EIO) { 3786 /* Wait for mode-1 reset to complete */ 3787 down_read(&adev->reset_domain->sem); 3788 up_read(&adev->reset_domain->sem); 3789 } 3790 3791 /* Wake up work to save bad pages to eeprom */ 3792 schedule_delayed_work(&con->page_retirement_dwork, 0); 3793 } else if (gpu_reset) { 3794 /* gpu just completed mode-2 reset or other reset */ 3795 /* Clear poison consumption messages cached in fifo */ 3796 msg_count = kfifo_len(&con->poison_fifo); 3797 if (msg_count) { 3798 amdgpu_ras_clear_poison_fifo(adev); 3799 atomic_sub(msg_count, &con->page_retirement_req_cnt); 3800 } 3801 3802 atomic_set(&con->poison_consumption_count, 0); 3803 3804 /* Wake up work to save bad pages to eeprom */ 3805 schedule_delayed_work(&con->page_retirement_dwork, 0); 3806 } 3807 mutex_unlock(&con->poison_lock); 3808 } 3809 3810 return 0; 3811 } 3812 3813 int amdgpu_ras_init_badpage_info(struct amdgpu_device *adev) 3814 { 3815 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 3816 struct 
amdgpu_ras_eeprom_control *control; 3817 int ret; 3818 3819 if (!con || amdgpu_sriov_vf(adev)) 3820 return 0; 3821 3822 if (amdgpu_uniras_enabled(adev)) 3823 return 0; 3824 3825 control = &con->eeprom_control; 3826 con->ras_smu_drv = amdgpu_dpm_get_ras_smu_driver(adev); 3827 3828 ret = amdgpu_ras_eeprom_init(control); 3829 control->is_eeprom_valid = !ret; 3830 3831 if (!adev->umc.ras || !adev->umc.ras->convert_ras_err_addr) 3832 control->ras_num_pa_recs = control->ras_num_recs; 3833 3834 if (adev->umc.ras && 3835 adev->umc.ras->get_retire_flip_bits) 3836 adev->umc.ras->get_retire_flip_bits(adev); 3837 3838 if (control->ras_num_recs && control->is_eeprom_valid) { 3839 ret = amdgpu_ras_load_bad_pages(adev); 3840 if (ret) { 3841 control->is_eeprom_valid = false; 3842 return 0; 3843 } 3844 3845 amdgpu_dpm_send_hbm_bad_pages_num( 3846 adev, control->ras_num_bad_pages); 3847 3848 if (con->update_channel_flag == true) { 3849 amdgpu_dpm_send_hbm_bad_channel_flag( 3850 adev, control->bad_channel_bitmap); 3851 con->update_channel_flag = false; 3852 } 3853 3854 /* The format action is only applied to new ASICs */ 3855 if (IP_VERSION_MAJ(amdgpu_ip_version(adev, UMC_HWIP, 0)) >= 12 && 3856 control->tbl_hdr.version < RAS_TABLE_VER_V3) 3857 if (!amdgpu_ras_eeprom_reset_table(control)) 3858 if (amdgpu_ras_save_bad_pages(adev, NULL)) 3859 dev_warn(adev->dev, "Failed to format RAS EEPROM data in V3 version!\n"); 3860 } 3861 3862 return 0; 3863 } 3864 3865 int amdgpu_ras_recovery_init(struct amdgpu_device *adev, bool init_bp_info) 3866 { 3867 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 3868 struct ras_err_handler_data **data; 3869 u32 max_eeprom_records_count = 0; 3870 int ret; 3871 3872 if (!con || amdgpu_sriov_vf(adev)) 3873 return 0; 3874 3875 /* Allow access to RAS EEPROM via debugfs, when the ASIC 3876 * supports RAS and debugfs is enabled, but when 3877 * adev->ras_enabled is unset, i.e. when "ras_enable" 3878 * module parameter is set to 0. 
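	 * con->adev is therefore assigned before the !adev->ras_enabled
	 * early return below, so the debugfs path still has a valid
	 * device pointer.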
3879 */ 3880 con->adev = adev; 3881 3882 if (!adev->ras_enabled) 3883 return 0; 3884 3885 data = &con->eh_data; 3886 *data = kzalloc_obj(**data); 3887 if (!*data) { 3888 ret = -ENOMEM; 3889 goto out; 3890 } 3891 3892 mutex_init(&con->recovery_lock); 3893 mutex_init(&con->poison_lock); 3894 INIT_WORK(&con->recovery_work, amdgpu_ras_do_recovery); 3895 atomic_set(&con->in_recovery, 0); 3896 atomic_set(&con->rma_in_recovery, 0); 3897 con->eeprom_control.bad_channel_bitmap = 0; 3898 3899 max_eeprom_records_count = amdgpu_ras_eeprom_max_record_count(&con->eeprom_control); 3900 amdgpu_ras_validate_threshold(adev, max_eeprom_records_count); 3901 3902 if (init_bp_info) { 3903 ret = amdgpu_ras_init_badpage_info(adev); 3904 if (ret) 3905 goto free; 3906 } 3907 3908 mutex_init(&con->page_rsv_lock); 3909 INIT_KFIFO(con->poison_fifo); 3910 mutex_init(&con->page_retirement_lock); 3911 init_waitqueue_head(&con->page_retirement_wq); 3912 atomic_set(&con->page_retirement_req_cnt, 0); 3913 atomic_set(&con->poison_creation_count, 0); 3914 atomic_set(&con->poison_consumption_count, 0); 3915 con->page_retirement_thread = 3916 kthread_run(amdgpu_ras_page_retirement_thread, adev, "umc_page_retirement"); 3917 if (IS_ERR(con->page_retirement_thread)) { 3918 con->page_retirement_thread = NULL; 3919 dev_warn(adev->dev, "Failed to create umc_page_retirement thread!!!\n"); 3920 } 3921 3922 INIT_DELAYED_WORK(&con->page_retirement_dwork, amdgpu_ras_do_page_retirement); 3923 amdgpu_ras_ecc_log_init(&con->umc_ecc_log); 3924 #ifdef CONFIG_X86_MCE_AMD 3925 if ((adev->asic_type == CHIP_ALDEBARAN) && 3926 (adev->gmc.xgmi.connected_to_cpu)) 3927 amdgpu_register_bad_pages_mca_notifier(adev); 3928 #endif 3929 return 0; 3930 3931 free: 3932 kfree((*data)->bps); 3933 kfree(*data); 3934 con->eh_data = NULL; 3935 out: 3936 dev_warn(adev->dev, "Failed to initialize ras recovery! (%d)\n", ret); 3937 3938 /* 3939 * Except error threshold exceeding case, other failure cases in this 3940 * function would not fail amdgpu driver init. 
3941 */ 3942 if (!amdgpu_ras_is_rma(adev)) 3943 ret = 0; 3944 else 3945 ret = -EINVAL; 3946 3947 return ret; 3948 } 3949 3950 static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev) 3951 { 3952 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 3953 struct ras_err_handler_data *data = con->eh_data; 3954 int max_flush_timeout = MAX_FLUSH_RETIRE_DWORK_TIMES; 3955 bool ret; 3956 3957 /* recovery_init failed to init it, fini is useless */ 3958 if (!data) 3959 return 0; 3960 3961 /* Save all cached bad pages to eeprom */ 3962 do { 3963 flush_delayed_work(&con->page_retirement_dwork); 3964 ret = amdgpu_ras_schedule_retirement_dwork(con, 0); 3965 } while (ret && max_flush_timeout--); 3966 3967 if (con->page_retirement_thread) 3968 kthread_stop(con->page_retirement_thread); 3969 3970 atomic_set(&con->page_retirement_req_cnt, 0); 3971 atomic_set(&con->poison_creation_count, 0); 3972 3973 mutex_destroy(&con->page_rsv_lock); 3974 3975 cancel_work_sync(&con->recovery_work); 3976 3977 cancel_delayed_work_sync(&con->page_retirement_dwork); 3978 3979 amdgpu_ras_ecc_log_fini(&con->umc_ecc_log); 3980 3981 mutex_lock(&con->recovery_lock); 3982 con->eh_data = NULL; 3983 kfree(data->bps); 3984 kfree(data); 3985 mutex_unlock(&con->recovery_lock); 3986 3987 amdgpu_ras_critical_region_init(adev); 3988 #ifdef CONFIG_X86_MCE_AMD 3989 amdgpu_unregister_bad_pages_mca_notifier(adev); 3990 #endif 3991 return 0; 3992 } 3993 /* recovery end */ 3994 3995 static bool amdgpu_ras_asic_supported(struct amdgpu_device *adev) 3996 { 3997 if (amdgpu_sriov_vf(adev)) { 3998 switch (amdgpu_ip_version(adev, MP0_HWIP, 0)) { 3999 case IP_VERSION(13, 0, 2): 4000 case IP_VERSION(13, 0, 6): 4001 case IP_VERSION(13, 0, 12): 4002 case IP_VERSION(13, 0, 14): 4003 return true; 4004 default: 4005 return false; 4006 } 4007 } 4008 4009 if (adev->asic_type == CHIP_IP_DISCOVERY) { 4010 switch (amdgpu_ip_version(adev, MP0_HWIP, 0)) { 4011 case IP_VERSION(13, 0, 0): 4012 case IP_VERSION(13, 0, 6): 4013 case IP_VERSION(13, 0, 10): 4014 case IP_VERSION(13, 0, 12): 4015 case IP_VERSION(13, 0, 14): 4016 case IP_VERSION(14, 0, 3): 4017 return true; 4018 default: 4019 return false; 4020 } 4021 } 4022 4023 return adev->asic_type == CHIP_VEGA10 || 4024 adev->asic_type == CHIP_VEGA20 || 4025 adev->asic_type == CHIP_ARCTURUS || 4026 adev->asic_type == CHIP_ALDEBARAN || 4027 adev->asic_type == CHIP_SIENNA_CICHLID; 4028 } 4029 4030 /* 4031 * this is workaround for vega20 workstation sku, 4032 * force enable gfx ras, ignore vbios gfx ras flag 4033 * due to GC EDC can not write 4034 */ 4035 static void amdgpu_ras_get_quirks(struct amdgpu_device *adev) 4036 { 4037 struct atom_context *ctx = adev->mode_info.atom_context; 4038 4039 if (!ctx) 4040 return; 4041 4042 if (strnstr(ctx->vbios_pn, "D16406", 4043 sizeof(ctx->vbios_pn)) || 4044 strnstr(ctx->vbios_pn, "D36002", 4045 sizeof(ctx->vbios_pn))) 4046 adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__GFX); 4047 } 4048 4049 /* Query ras capablity via atomfirmware interface */ 4050 static void amdgpu_ras_query_ras_capablity_from_vbios(struct amdgpu_device *adev) 4051 { 4052 /* mem_ecc cap */ 4053 if (amdgpu_atomfirmware_mem_ecc_supported(adev)) { 4054 dev_info(adev->dev, "MEM ECC is active.\n"); 4055 adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__UMC | 4056 1 << AMDGPU_RAS_BLOCK__DF); 4057 } else { 4058 dev_info(adev->dev, "MEM ECC is not presented.\n"); 4059 } 4060 4061 /* sram_ecc cap */ 4062 if (amdgpu_atomfirmware_sram_ecc_supported(adev)) { 4063 dev_info(adev->dev, "SRAM ECC is active.\n"); 4064 if 
(!amdgpu_sriov_vf(adev))
			adev->ras_hw_enabled |= ~(1 << AMDGPU_RAS_BLOCK__UMC |
						  1 << AMDGPU_RAS_BLOCK__DF);
		else
			adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__PCIE_BIF |
						 1 << AMDGPU_RAS_BLOCK__SDMA |
						 1 << AMDGPU_RAS_BLOCK__GFX);

		/*
		 * VCN/JPEG RAS can be supported in both bare metal and
		 * SRIOV environments
		 */
		if (amdgpu_ip_version(adev, VCN_HWIP, 0) == IP_VERSION(2, 6, 0) ||
		    amdgpu_ip_version(adev, VCN_HWIP, 0) == IP_VERSION(4, 0, 0) ||
		    amdgpu_ip_version(adev, VCN_HWIP, 0) == IP_VERSION(4, 0, 3) ||
		    amdgpu_ip_version(adev, VCN_HWIP, 0) == IP_VERSION(5, 0, 1))
			adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__VCN |
						 1 << AMDGPU_RAS_BLOCK__JPEG);
		else
			adev->ras_hw_enabled &= ~(1 << AMDGPU_RAS_BLOCK__VCN |
						  1 << AMDGPU_RAS_BLOCK__JPEG);

		/*
		 * XGMI RAS is not supported if the number of xgmi physical
		 * nodes is zero
		 */
		if (!adev->gmc.xgmi.num_physical_nodes)
			adev->ras_hw_enabled &= ~(1 << AMDGPU_RAS_BLOCK__XGMI_WAFL);
	} else {
		dev_info(adev->dev, "SRAM ECC is not present.\n");
	}
}

/* Query poison mode from the umc/df IP callbacks */
static void amdgpu_ras_query_poison_mode(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	bool df_poison, umc_poison;

	/* poison setting is useless on an SRIOV guest */
	if (amdgpu_sriov_vf(adev) || !con)
		return;

	/* Init the poison supported flag, the default value is false */
	if (adev->gmc.xgmi.connected_to_cpu ||
	    adev->gmc.is_app_apu) {
		/* enabled by default when the GPU is connected to a CPU */
		con->poison_supported = true;
	} else if (adev->df.funcs &&
		   adev->df.funcs->query_ras_poison_mode &&
		   adev->umc.ras &&
		   adev->umc.ras->query_ras_poison_mode) {
		df_poison =
			adev->df.funcs->query_ras_poison_mode(adev);
		umc_poison =
			adev->umc.ras->query_ras_poison_mode(adev);

		/* Poison is supported only if it is set in both DF and UMC */
		if (df_poison && umc_poison)
			con->poison_supported = true;
		else if (df_poison != umc_poison)
			dev_warn(adev->dev,
				 "Poison setting is inconsistent in DF/UMC(%d:%d)!\n",
				 df_poison, umc_poison);
	}
}

/*
 * Check the hardware's ras ability, which will be saved in hw_supported.
 * If the hardware does not support ras, we can skip some ras
 * initialization and forbid ras operations from IPs.
 * Software itself (say, a boot parameter) may also limit the ras ability
 * even when the hardware supports it. In that case we still need to
 * allow IPs to do some limited operations, like disable, so ras is
 * initialized as normal, but each function has to check whether the
 * operation is allowed or not.
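 * The net result below: adev->ras_enabled = adev->ras_hw_enabled &
 * amdgpu_ras_mask, unless the ras_enable module parameter is 0, which
 * clears it entirely.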
 */
static void amdgpu_ras_check_supported(struct amdgpu_device *adev)
{
	adev->ras_hw_enabled = adev->ras_enabled = 0;

	if (!amdgpu_ras_asic_supported(adev))
		return;

	if (amdgpu_sriov_vf(adev)) {
		if (amdgpu_virt_get_ras_capability(adev))
			goto init_ras_enabled_flag;
	}

	/* query ras capability from psp */
	if (amdgpu_psp_get_ras_capability(&adev->psp))
		goto init_ras_enabled_flag;

	/* query ras capability from vbios */
	if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) {
		amdgpu_ras_query_ras_capablity_from_vbios(adev);
	} else {
		/* the driver only manages the ras features of a few IP
		 * blocks when the GPU is connected to the cpu through XGMI */
		adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__GFX |
					 1 << AMDGPU_RAS_BLOCK__SDMA |
					 1 << AMDGPU_RAS_BLOCK__MMHUB);
	}

	/* apply asic specific settings (vega20 only for now) */
	amdgpu_ras_get_quirks(adev);

	/* query poison mode from the umc/df ip callbacks */
	amdgpu_ras_query_poison_mode(adev);

init_ras_enabled_flag:
	/* hw_supported needs to be aligned with RAS block mask. */
	adev->ras_hw_enabled &= AMDGPU_RAS_BLOCK_MASK;

	adev->ras_enabled = amdgpu_ras_enable == 0 ? 0 :
		adev->ras_hw_enabled & amdgpu_ras_mask;

	/* aca is disabled by default except for psp v13_0_6/v13_0_12/v13_0_14 */
	if (!amdgpu_sriov_vf(adev)) {
		adev->aca.is_enabled =
			(amdgpu_ip_version(adev, MP0_HWIP, 0) == IP_VERSION(13, 0, 6) ||
			 amdgpu_ip_version(adev, MP0_HWIP, 0) == IP_VERSION(13, 0, 12) ||
			 amdgpu_ip_version(adev, MP0_HWIP, 0) == IP_VERSION(13, 0, 14));
	}

	/* the bad page feature is not applicable to specific app platforms */
	if (adev->gmc.is_app_apu &&
	    amdgpu_ip_version(adev, UMC_HWIP, 0) == IP_VERSION(12, 0, 0))
		amdgpu_bad_page_threshold = 0;
}

static void amdgpu_ras_counte_dw(struct work_struct *work)
{
	struct amdgpu_ras *con = container_of(work, struct amdgpu_ras,
					      ras_counte_delay_work.work);
	struct amdgpu_device *adev = con->adev;
	struct drm_device *dev = adev_to_drm(adev);
	unsigned long ce_count, ue_count;
	int res;

	res = pm_runtime_get_sync(dev->dev);
	if (res < 0)
		goto Out;

	/* Cache new values.
	 */
	if (amdgpu_ras_query_error_count(adev, &ce_count, &ue_count, NULL) == 0) {
		atomic_set(&con->ras_ce_count, ce_count);
		atomic_set(&con->ras_ue_count, ue_count);
	}

Out:
	pm_runtime_put_autosuspend(dev->dev);
}

static int amdgpu_get_ras_schema(struct amdgpu_device *adev)
{
	return amdgpu_ras_is_poison_mode_supported(adev) ?
		(AMDGPU_RAS_ERROR__POISON |
		 AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE |
		 AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE |
		 AMDGPU_RAS_ERROR__PARITY) :
		(AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE |
		 AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE |
		 AMDGPU_RAS_ERROR__PARITY);
}

static void ras_event_mgr_init(struct ras_event_manager *mgr)
{
	struct ras_event_state *event_state;
	int i;

	memset(mgr, 0, sizeof(*mgr));
	atomic64_set(&mgr->seqno, 0);

	for (i = 0; i < ARRAY_SIZE(mgr->event_state); i++) {
		event_state = &mgr->event_state[i];
		event_state->last_seqno = RAS_EVENT_INVALID_ID;
		atomic64_set(&event_state->count, 0);
	}
}

static void amdgpu_ras_event_mgr_init(struct amdgpu_device *adev)
{
	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
	struct amdgpu_hive_info *hive;

	if (!ras)
		return;

	hive = amdgpu_get_xgmi_hive(adev);
	ras->event_mgr = hive ? &hive->event_mgr : &ras->__event_mgr;

	/* init the event manager with node 0 on an xgmi system */
	if (!amdgpu_reset_in_recovery(adev)) {
		if (!hive || adev->gmc.xgmi.node_id == 0)
			ras_event_mgr_init(ras->event_mgr);
	}

	if (hive)
		amdgpu_put_xgmi_hive(hive);
}

static void amdgpu_ras_init_reserved_vram_size(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

	if (!con || (adev->flags & AMD_IS_APU))
		return;

	switch (amdgpu_ip_version(adev, MP0_HWIP, 0)) {
	case IP_VERSION(13, 0, 2):
	case IP_VERSION(13, 0, 6):
	case IP_VERSION(13, 0, 12):
		con->reserved_pages_in_bytes = AMDGPU_RAS_RESERVED_VRAM_SIZE_DEFAULT;
		break;
	case IP_VERSION(13, 0, 14):
		con->reserved_pages_in_bytes = (AMDGPU_RAS_RESERVED_VRAM_SIZE_DEFAULT << 1);
		break;
	default:
		break;
	}
}

int amdgpu_ras_init(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	int r;

	if (con)
		return 0;

	con = kzalloc(sizeof(*con) +
			sizeof(struct ras_manager) * AMDGPU_RAS_BLOCK_COUNT +
			sizeof(struct ras_manager) * AMDGPU_RAS_MCA_BLOCK_COUNT,
			GFP_KERNEL);
	if (!con)
		return -ENOMEM;

	con->adev = adev;
	INIT_DELAYED_WORK(&con->ras_counte_delay_work, amdgpu_ras_counte_dw);
	atomic_set(&con->ras_ce_count, 0);
	atomic_set(&con->ras_ue_count, 0);

	con->objs = (struct ras_manager *)(con + 1);

	amdgpu_ras_set_context(adev, con);

	amdgpu_ras_check_supported(adev);

	if (!adev->ras_enabled || adev->asic_type == CHIP_VEGA10) {
		/* set the gfx block ras context feature for VEGA20 Gaming,
		 * so that a ras disable cmd is sent to the ras ta during
		 * ras late init.
		 */
		if (!adev->ras_enabled && adev->asic_type == CHIP_VEGA20) {
			con->features |= BIT(AMDGPU_RAS_BLOCK__GFX);

			return 0;
		}

		r = 0;
		goto release_con;
	}

	con->update_channel_flag = false;
	con->features = 0;
	con->schema = 0;
	INIT_LIST_HEAD(&con->head);
	/* Might need to get this flag from vbios.
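	 * AMDGPU_RAS_FLAG_INIT_BY_VBIOS makes amdgpu_ras_resume() enable
	 * every hw-supported feature first and then disable the ones that
	 * turn out to be unsupported.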
 */
	con->flags = RAS_DEFAULT_FLAGS;

	/* initialize the nbio ras function ahead of any other
	 * ras functions so the hardware fatal error interrupt
	 * can be enabled as early as possible */
	switch (amdgpu_ip_version(adev, NBIO_HWIP, 0)) {
	case IP_VERSION(7, 4, 0):
	case IP_VERSION(7, 4, 1):
	case IP_VERSION(7, 4, 4):
		if (!adev->gmc.xgmi.connected_to_cpu)
			adev->nbio.ras = &nbio_v7_4_ras;
		break;
	case IP_VERSION(4, 3, 0):
		if (adev->ras_hw_enabled & (1 << AMDGPU_RAS_BLOCK__DF))
			/* unlike other generations of nbio ras,
			 * nbio v4_3 only supports the fatal error interrupt,
			 * which informs software that DF is frozen due to
			 * a system fatal error event. The driver should not
			 * enable nbio ras in such a case. Instead,
			 * check DF RAS. */
			adev->nbio.ras = &nbio_v4_3_ras;
		break;
	case IP_VERSION(6, 3, 1):
		if (adev->ras_hw_enabled & (1 << AMDGPU_RAS_BLOCK__DF))
			/* unlike other generations of nbio ras,
			 * nbif v6_3_1 only supports the fatal error interrupt,
			 * which informs software that DF is frozen due to
			 * a system fatal error event. The driver should not
			 * enable nbio ras in such a case. Instead,
			 * check DF RAS.
			 */
			adev->nbio.ras = &nbif_v6_3_1_ras;
		break;
	case IP_VERSION(7, 9, 0):
	case IP_VERSION(7, 9, 1):
		if (!adev->gmc.is_app_apu)
			adev->nbio.ras = &nbio_v7_9_ras;
		break;
	default:
		/* nbio ras is not available */
		break;
	}

	/* the nbio ras block needs to be enabled ahead of other ras blocks
	 * to handle fatal errors */
	r = amdgpu_nbio_ras_sw_init(adev);
	if (r)
		goto release_con;

	if (adev->nbio.ras &&
	    adev->nbio.ras->init_ras_controller_interrupt) {
		r = adev->nbio.ras->init_ras_controller_interrupt(adev);
		if (r)
			goto release_con;
	}

	if (adev->nbio.ras &&
	    adev->nbio.ras->init_ras_err_event_athub_interrupt) {
		r = adev->nbio.ras->init_ras_err_event_athub_interrupt(adev);
		if (r)
			goto release_con;
	}

	/* Pack socket_id into ras feature mask bits [31:29] */
	if (adev->smuio.funcs &&
	    adev->smuio.funcs->get_socket_id)
		con->features |= ((adev->smuio.funcs->get_socket_id(adev)) <<
					AMDGPU_RAS_FEATURES_SOCKETID_SHIFT);

	/* Get the RAS schema for the particular SOC */
	con->schema = amdgpu_get_ras_schema(adev);

	amdgpu_ras_init_reserved_vram_size(adev);

	if (amdgpu_ras_fs_init(adev)) {
		r = -EINVAL;
		goto release_con;
	}

	if (amdgpu_ras_aca_is_supported(adev)) {
		if (amdgpu_aca_is_enabled(adev))
			r = amdgpu_aca_init(adev);
		else
			r = amdgpu_mca_init(adev);
		if (r)
			goto release_con;
	}

	con->init_task_pid = task_pid_nr(current);
	get_task_comm(con->init_task_comm, current);

	mutex_init(&con->critical_region_lock);
	INIT_LIST_HEAD(&con->critical_region_head);

	dev_info(adev->dev, "RAS INFO: ras initialized successfully, "
		 "hardware ability[%x] ras_mask[%x]\n",
		 adev->ras_hw_enabled, adev->ras_enabled);

	return 0;
release_con:
	amdgpu_ras_set_context(adev, NULL);
	kfree(con);

	return r;
}

int amdgpu_persistent_edc_harvesting_supported(struct amdgpu_device *adev)
{
	if (adev->gmc.xgmi.connected_to_cpu ||
	    adev->gmc.is_app_apu)
		return 1;
	return 0;
}

static int amdgpu_persistent_edc_harvesting(struct amdgpu_device *adev,
					    struct ras_common_if *ras_block)
{
	struct ras_query_if info = {
		.head = *ras_block,
	};

	if (!amdgpu_persistent_edc_harvesting_supported(adev))
		return 0;

	if (amdgpu_ras_query_error_status(adev, &info) != 0)
		drm_warn(adev_to_drm(adev), "RAS init query failure");

	if (amdgpu_ras_reset_error_status(adev, ras_block->block) != 0)
		drm_warn(adev_to_drm(adev), "RAS init harvest reset failure");

	return 0;
}

bool amdgpu_ras_is_poison_mode_supported(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

	if (!con)
		return false;

	return con->poison_supported;
}

/* helper function to handle common stuff in the ip late init phase */
int amdgpu_ras_block_late_init(struct amdgpu_device *adev,
			       struct ras_common_if *ras_block)
{
	struct amdgpu_ras_block_object *ras_obj = NULL;
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_query_if *query_info;
	unsigned long ue_count, ce_count;
	int r;

	/* disable RAS feature per IP block if it is not supported */
	if (!amdgpu_ras_is_supported(adev, ras_block->block)) {
		amdgpu_ras_feature_enable_on_boot(adev, ras_block, 0);
		return 0;
	}

	r = amdgpu_ras_feature_enable_on_boot(adev, ras_block, 1);
	if (r) {
		if (adev->in_suspend || amdgpu_reset_in_recovery(adev)) {
			/* in the resume phase, if we fail to enable ras,
			 * clean up all ras fs nodes and disable ras */
			goto cleanup;
		} else
			return r;
	}

	/* check for errors on warm reset for ASICs that support persistent
	 * edc harvesting */
	amdgpu_persistent_edc_harvesting(adev, ras_block);

	/* in the resume phase, no need to create ras fs nodes */
	if (adev->in_suspend || amdgpu_reset_in_recovery(adev))
		return 0;

	ras_obj = container_of(ras_block, struct amdgpu_ras_block_object, ras_comm);
	if (ras_obj->ras_cb || (ras_obj->hw_ops &&
	    (ras_obj->hw_ops->query_poison_status ||
	    ras_obj->hw_ops->handle_poison_consumption))) {
		r = amdgpu_ras_interrupt_add_handler(adev, ras_block);
		if (r)
			goto cleanup;
	}

	if (ras_obj->hw_ops &&
	    (ras_obj->hw_ops->query_ras_error_count ||
	    ras_obj->hw_ops->query_ras_error_status)) {
		r = amdgpu_ras_sysfs_create(adev, ras_block);
		if (r)
			goto interrupt;

		/* Those are the cached values at init.
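		 * They seed con->ras_ce_count and con->ras_ue_count with the
		 * error counts already present when the block is initialized.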
		 */
		query_info = kzalloc_obj(*query_info);
		if (!query_info)
			return -ENOMEM;
		memcpy(&query_info->head, ras_block, sizeof(struct ras_common_if));

		if (amdgpu_ras_query_error_count(adev, &ce_count, &ue_count, query_info) == 0) {
			atomic_set(&con->ras_ce_count, ce_count);
			atomic_set(&con->ras_ue_count, ue_count);
		}

		kfree(query_info);
	}

	return 0;

interrupt:
	if (ras_obj->ras_cb)
		amdgpu_ras_interrupt_remove_handler(adev, ras_block);
cleanup:
	amdgpu_ras_feature_enable(adev, ras_block, 0);
	return r;
}

static int amdgpu_ras_block_late_init_default(struct amdgpu_device *adev,
					      struct ras_common_if *ras_block)
{
	return amdgpu_ras_block_late_init(adev, ras_block);
}

/* helper function to remove the ras fs node and interrupt handler */
void amdgpu_ras_block_late_fini(struct amdgpu_device *adev,
				struct ras_common_if *ras_block)
{
	struct amdgpu_ras_block_object *ras_obj;
	if (!ras_block)
		return;

	amdgpu_ras_sysfs_remove(adev, ras_block);

	ras_obj = container_of(ras_block, struct amdgpu_ras_block_object, ras_comm);
	if (ras_obj->ras_cb)
		amdgpu_ras_interrupt_remove_handler(adev, ras_block);
}

static void amdgpu_ras_block_late_fini_default(struct amdgpu_device *adev,
					       struct ras_common_if *ras_block)
{
	return amdgpu_ras_block_late_fini(adev, ras_block);
}

/* do some init work after IP late init, as a dependency.
 * it runs in the resume/gpu reset/boot-up cases.
 */
void amdgpu_ras_resume(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj, *tmp;

	if (!adev->ras_enabled || !con) {
		/* clean the ras context for VEGA20 Gaming after the ras
		 * disable cmd has been sent */
		amdgpu_release_ras_context(adev);

		return;
	}

	if (con->flags & AMDGPU_RAS_FLAG_INIT_BY_VBIOS) {
		/* Set up all other IPs which are not implemented. There is a
		 * tricky thing: an IP's actual ras error type should be
		 * MULTI_UNCORRECTABLE, but since the driver does not handle
		 * it, ERROR_NONE makes sense anyway.
		 */
		amdgpu_ras_enable_all_features(adev, 1);

		/* We enable ras on all hw_supported blocks, but the boot
		 * parameter might disable some of them, and one or more IPs
		 * may not be implemented yet. So we disable those on their
		 * behalf.
		 */
		list_for_each_entry_safe(obj, tmp, &con->head, node) {
			if (!amdgpu_ras_is_supported(adev, obj->head.block)) {
				amdgpu_ras_feature_enable(adev, &obj->head, 0);
				/* there should not be any reference left. */
				WARN_ON(alive_obj(obj));
			}
		}
	}
}

void amdgpu_ras_suspend(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

	if (!adev->ras_enabled || !con)
		return;

	amdgpu_ras_disable_all_features(adev, 0);
	/* Make sure all ras objects are disabled.
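	 * If any feature bit survives the first pass above, the second
	 * pass below disables whatever is left.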
 */
	if (AMDGPU_RAS_GET_FEATURES(con->features))
		amdgpu_ras_disable_all_features(adev, 1);
}

int amdgpu_ras_late_init(struct amdgpu_device *adev)
{
	struct amdgpu_ras_block_list *node, *tmp;
	struct amdgpu_ras_block_object *obj;
	int r;

	amdgpu_ras_event_mgr_init(adev);

	if (amdgpu_ras_aca_is_supported(adev)) {
		if (amdgpu_reset_in_recovery(adev)) {
			if (amdgpu_aca_is_enabled(adev))
				r = amdgpu_aca_reset(adev);
			else
				r = amdgpu_mca_reset(adev);
			if (r)
				return r;
		}

		if (!amdgpu_sriov_vf(adev)) {
			if (amdgpu_aca_is_enabled(adev))
				amdgpu_ras_set_aca_debug_mode(adev, false);
			else
				amdgpu_ras_set_mca_debug_mode(adev, false);
		}
	}

	/* The guest side doesn't need to init the ras feature */
	if (amdgpu_sriov_vf(adev) && !amdgpu_sriov_ras_telemetry_en(adev))
		return 0;

	list_for_each_entry_safe(node, tmp, &adev->ras_list, node) {
		obj = node->ras_obj;
		if (!obj) {
			dev_warn(adev->dev, "Warning: abnormal ras list node.\n");
			continue;
		}

		if (!amdgpu_ras_is_supported(adev, obj->ras_comm.block))
			continue;

		if (obj->ras_late_init) {
			r = obj->ras_late_init(adev, &obj->ras_comm);
			if (r) {
				dev_err(adev->dev, "%s failed to execute ras_late_init! ret:%d\n",
					obj->ras_comm.name, r);
				return r;
			}
		} else
			amdgpu_ras_block_late_init_default(adev, &obj->ras_comm);
	}

	amdgpu_ras_check_bad_page_status(adev);

	return 0;
}

/* do some fini work before IP fini as a dependency */
int amdgpu_ras_pre_fini(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

	if (!adev->ras_enabled || !con)
		return 0;

	/* Need to disable ras on all IPs here before ip [hw/sw]fini */
	if (AMDGPU_RAS_GET_FEATURES(con->features))
		amdgpu_ras_disable_all_features(adev, 0);
	amdgpu_ras_recovery_fini(adev);
	return 0;
}

int amdgpu_ras_fini(struct amdgpu_device *adev)
{
	struct amdgpu_ras_block_list *ras_node, *tmp;
	struct amdgpu_ras_block_object *obj = NULL;
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

	if (!adev->ras_enabled || !con)
		return 0;

	amdgpu_ras_critical_region_fini(adev);
	mutex_destroy(&con->critical_region_lock);

	list_for_each_entry_safe(ras_node, tmp, &adev->ras_list, node) {
		if (ras_node->ras_obj) {
			obj = ras_node->ras_obj;
			if (amdgpu_ras_is_supported(adev, obj->ras_comm.block) &&
			    obj->ras_fini)
				obj->ras_fini(adev, &obj->ras_comm);
			else
				amdgpu_ras_block_late_fini_default(adev, &obj->ras_comm);
		}

		/* Clear ras blocks from ras_list and free the ras block list node */
		list_del(&ras_node->node);
		kfree(ras_node);
	}

	amdgpu_ras_fs_fini(adev);
	amdgpu_ras_interrupt_remove_all(adev);

	if (amdgpu_ras_aca_is_supported(adev)) {
		if (amdgpu_aca_is_enabled(adev))
			amdgpu_aca_fini(adev);
		else
			amdgpu_mca_fini(adev);
	}

	WARN(AMDGPU_RAS_GET_FEATURES(con->features), "Feature mask is not cleared");

	if (AMDGPU_RAS_GET_FEATURES(con->features))
		amdgpu_ras_disable_all_features(adev, 0);

	cancel_delayed_work_sync(&con->ras_counte_delay_work);

	amdgpu_ras_set_context(adev, NULL);
	kfree(con);

	return 0;
}

bool amdgpu_ras_get_fed_status(struct amdgpu_device
*adev) 4745 { 4746 struct amdgpu_ras *ras; 4747 4748 ras = amdgpu_ras_get_context(adev); 4749 if (!ras) 4750 return false; 4751 4752 return test_bit(AMDGPU_RAS_BLOCK__LAST, &ras->ras_err_state); 4753 } 4754 4755 void amdgpu_ras_set_fed(struct amdgpu_device *adev, bool status) 4756 { 4757 struct amdgpu_ras *ras; 4758 4759 ras = amdgpu_ras_get_context(adev); 4760 if (ras) { 4761 if (status) 4762 set_bit(AMDGPU_RAS_BLOCK__LAST, &ras->ras_err_state); 4763 else 4764 clear_bit(AMDGPU_RAS_BLOCK__LAST, &ras->ras_err_state); 4765 } 4766 } 4767 4768 void amdgpu_ras_clear_err_state(struct amdgpu_device *adev) 4769 { 4770 struct amdgpu_ras *ras; 4771 4772 ras = amdgpu_ras_get_context(adev); 4773 if (ras) { 4774 ras->ras_err_state = 0; 4775 ras->gpu_reset_flags = 0; 4776 } 4777 } 4778 4779 void amdgpu_ras_set_err_poison(struct amdgpu_device *adev, 4780 enum amdgpu_ras_block block) 4781 { 4782 struct amdgpu_ras *ras; 4783 4784 ras = amdgpu_ras_get_context(adev); 4785 if (ras) 4786 set_bit(block, &ras->ras_err_state); 4787 } 4788 4789 bool amdgpu_ras_is_err_state(struct amdgpu_device *adev, int block) 4790 { 4791 struct amdgpu_ras *ras; 4792 4793 ras = amdgpu_ras_get_context(adev); 4794 if (ras) { 4795 if (block == AMDGPU_RAS_BLOCK__ANY) 4796 return (ras->ras_err_state != 0); 4797 else 4798 return test_bit(block, &ras->ras_err_state) || 4799 test_bit(AMDGPU_RAS_BLOCK__LAST, 4800 &ras->ras_err_state); 4801 } 4802 4803 return false; 4804 } 4805 4806 static struct ras_event_manager *__get_ras_event_mgr(struct amdgpu_device *adev) 4807 { 4808 struct amdgpu_ras *ras; 4809 4810 ras = amdgpu_ras_get_context(adev); 4811 if (!ras) 4812 return NULL; 4813 4814 return ras->event_mgr; 4815 } 4816 4817 int amdgpu_ras_mark_ras_event_caller(struct amdgpu_device *adev, enum ras_event_type type, 4818 const void *caller) 4819 { 4820 struct ras_event_manager *event_mgr; 4821 struct ras_event_state *event_state; 4822 int ret = 0; 4823 4824 if (amdgpu_uniras_enabled(adev)) 4825 return 0; 4826 4827 if (type >= RAS_EVENT_TYPE_COUNT) { 4828 ret = -EINVAL; 4829 goto out; 4830 } 4831 4832 event_mgr = __get_ras_event_mgr(adev); 4833 if (!event_mgr) { 4834 ret = -EINVAL; 4835 goto out; 4836 } 4837 4838 event_state = &event_mgr->event_state[type]; 4839 event_state->last_seqno = atomic64_inc_return(&event_mgr->seqno); 4840 atomic64_inc(&event_state->count); 4841 4842 out: 4843 if (ret && caller) 4844 dev_warn(adev->dev, "failed mark ras event (%d) in %ps, ret:%d\n", 4845 (int)type, caller, ret); 4846 4847 return ret; 4848 } 4849 4850 u64 amdgpu_ras_acquire_event_id(struct amdgpu_device *adev, enum ras_event_type type) 4851 { 4852 struct ras_event_manager *event_mgr; 4853 u64 id; 4854 4855 if (type >= RAS_EVENT_TYPE_COUNT) 4856 return RAS_EVENT_INVALID_ID; 4857 4858 switch (type) { 4859 case RAS_EVENT_TYPE_FATAL: 4860 case RAS_EVENT_TYPE_POISON_CREATION: 4861 case RAS_EVENT_TYPE_POISON_CONSUMPTION: 4862 event_mgr = __get_ras_event_mgr(adev); 4863 if (!event_mgr) 4864 return RAS_EVENT_INVALID_ID; 4865 4866 id = event_mgr->event_state[type].last_seqno; 4867 break; 4868 case RAS_EVENT_TYPE_INVALID: 4869 default: 4870 id = RAS_EVENT_INVALID_ID; 4871 break; 4872 } 4873 4874 return id; 4875 } 4876 4877 int amdgpu_ras_global_ras_isr(struct amdgpu_device *adev) 4878 { 4879 if (atomic_cmpxchg(&amdgpu_ras_in_intr, 0, 1) == 0) { 4880 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 4881 enum ras_event_type type = RAS_EVENT_TYPE_FATAL; 4882 u64 event_id = RAS_EVENT_INVALID_ID; 4883 4884 if (amdgpu_uniras_enabled(adev)) 4885 return 0; 4886 
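		/* Record the fatal interrupt as a RAS event so the logs that
		 * follow share a single event id.
		 */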
4887 if (!amdgpu_ras_mark_ras_event(adev, type)) 4888 event_id = amdgpu_ras_acquire_event_id(adev, type); 4889 4890 RAS_EVENT_LOG(adev, event_id, "uncorrectable hardware error" 4891 "(ERREVENT_ATHUB_INTERRUPT) detected!\n"); 4892 4893 amdgpu_ras_set_fed(adev, true); 4894 ras->gpu_reset_flags |= AMDGPU_RAS_GPU_RESET_MODE1_RESET; 4895 amdgpu_ras_reset_gpu(adev); 4896 } 4897 4898 return -EBUSY; 4899 } 4900 4901 bool amdgpu_ras_need_emergency_restart(struct amdgpu_device *adev) 4902 { 4903 if (adev->asic_type == CHIP_VEGA20 && 4904 adev->pm.fw_version <= 0x283400) { 4905 return !(amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) && 4906 amdgpu_ras_intr_triggered(); 4907 } 4908 4909 return false; 4910 } 4911 4912 void amdgpu_release_ras_context(struct amdgpu_device *adev) 4913 { 4914 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 4915 4916 if (!con) 4917 return; 4918 4919 if (!adev->ras_enabled && con->features & BIT(AMDGPU_RAS_BLOCK__GFX)) { 4920 con->features &= ~BIT(AMDGPU_RAS_BLOCK__GFX); 4921 amdgpu_ras_set_context(adev, NULL); 4922 kfree(con); 4923 } 4924 } 4925 4926 #ifdef CONFIG_X86_MCE_AMD 4927 static struct amdgpu_device *find_adev(uint32_t node_id) 4928 { 4929 int i; 4930 struct amdgpu_device *adev = NULL; 4931 4932 for (i = 0; i < mce_adev_list.num_gpu; i++) { 4933 adev = mce_adev_list.devs[i]; 4934 4935 if (adev && adev->gmc.xgmi.connected_to_cpu && 4936 adev->gmc.xgmi.physical_node_id == node_id) 4937 break; 4938 adev = NULL; 4939 } 4940 4941 return adev; 4942 } 4943 4944 #define GET_MCA_IPID_GPUID(m) (((m) >> 44) & 0xF) 4945 #define GET_UMC_INST(m) (((m) >> 21) & 0x7) 4946 #define GET_CHAN_INDEX(m) ((((m) >> 12) & 0x3) | (((m) >> 18) & 0x4)) 4947 #define GPU_ID_OFFSET 8 4948 4949 static int amdgpu_bad_page_notifier(struct notifier_block *nb, 4950 unsigned long val, void *data) 4951 { 4952 struct mce *m = (struct mce *)data; 4953 struct amdgpu_device *adev = NULL; 4954 uint32_t gpu_id = 0; 4955 uint32_t umc_inst = 0, ch_inst = 0; 4956 4957 /* 4958 * If the error was generated in UMC_V2, which belongs to GPU UMCs, 4959 * and error occurred in DramECC (Extended error code = 0) then only 4960 * process the error, else bail out. 4961 */ 4962 if (!m || !((smca_get_bank_type(m->extcpu, m->bank) == SMCA_UMC_V2) && 4963 (XEC(m->status, 0x3f) == 0x0))) 4964 return NOTIFY_DONE; 4965 4966 /* 4967 * If it is correctable error, return. 4968 */ 4969 if (mce_is_correctable(m)) 4970 return NOTIFY_OK; 4971 4972 /* 4973 * GPU Id is offset by GPU_ID_OFFSET in MCA_IPID_UMC register. 4974 */ 4975 gpu_id = GET_MCA_IPID_GPUID(m->ipid) - GPU_ID_OFFSET; 4976 4977 adev = find_adev(gpu_id); 4978 if (!adev) { 4979 DRM_WARN("%s: Unable to find adev for gpu_id: %d\n", __func__, 4980 gpu_id); 4981 return NOTIFY_DONE; 4982 } 4983 4984 /* 4985 * If it is uncorrectable error, then find out UMC instance and 4986 * channel index. 4987 */ 4988 umc_inst = GET_UMC_INST(m->ipid); 4989 ch_inst = GET_CHAN_INDEX(m->ipid); 4990 4991 dev_info(adev->dev, "Uncorrectable error detected in UMC inst: %d, chan_idx: %d", 4992 umc_inst, ch_inst); 4993 4994 if (!amdgpu_umc_page_retirement_mca(adev, m->addr, ch_inst, umc_inst)) 4995 return NOTIFY_OK; 4996 else 4997 return NOTIFY_DONE; 4998 } 4999 5000 static struct notifier_block amdgpu_bad_page_nb = { 5001 .notifier_call = amdgpu_bad_page_notifier, 5002 .priority = MCE_PRIO_UC, 5003 }; 5004 5005 static void amdgpu_register_bad_pages_mca_notifier(struct amdgpu_device *adev) 5006 { 5007 /* 5008 * Add the adev to the mce_adev_list. 
5009 * During mode2 reset, amdgpu device is temporarily 5010 * removed from the mgpu_info list which can cause 5011 * page retirement to fail. 5012 * Use this list instead of mgpu_info to find the amdgpu 5013 * device on which the UMC error was reported. 5014 */ 5015 mce_adev_list.devs[mce_adev_list.num_gpu++] = adev; 5016 5017 /* 5018 * Register the x86 notifier only once 5019 * with MCE subsystem. 5020 */ 5021 if (notifier_registered == false) { 5022 mce_register_decode_chain(&amdgpu_bad_page_nb); 5023 notifier_registered = true; 5024 } 5025 } 5026 static void amdgpu_unregister_bad_pages_mca_notifier(struct amdgpu_device *adev) 5027 { 5028 int i, j; 5029 5030 if (!notifier_registered && !mce_adev_list.num_gpu) 5031 return; 5032 for (i = 0, j = 0; i < mce_adev_list.num_gpu; i++) { 5033 if (mce_adev_list.devs[i] == adev) 5034 mce_adev_list.devs[i] = NULL; 5035 if (!mce_adev_list.devs[i]) 5036 ++j; 5037 } 5038 5039 if (j == mce_adev_list.num_gpu) { 5040 mce_adev_list.num_gpu = 0; 5041 /* Unregister x86 notifier with MCE subsystem. */ 5042 if (notifier_registered) { 5043 mce_unregister_decode_chain(&amdgpu_bad_page_nb); 5044 notifier_registered = false; 5045 } 5046 } 5047 } 5048 #endif 5049 5050 struct amdgpu_ras *amdgpu_ras_get_context(struct amdgpu_device *adev) 5051 { 5052 if (!adev) 5053 return NULL; 5054 5055 return adev->psp.ras_context.ras; 5056 } 5057 5058 int amdgpu_ras_set_context(struct amdgpu_device *adev, struct amdgpu_ras *ras_con) 5059 { 5060 if (!adev) 5061 return -EINVAL; 5062 5063 adev->psp.ras_context.ras = ras_con; 5064 return 0; 5065 } 5066 5067 /* check if ras is supported on block, say, sdma, gfx */ 5068 int amdgpu_ras_is_supported(struct amdgpu_device *adev, 5069 unsigned int block) 5070 { 5071 int ret = 0; 5072 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 5073 5074 if (block >= AMDGPU_RAS_BLOCK_COUNT) 5075 return 0; 5076 5077 ret = ras && (adev->ras_enabled & (1 << block)); 5078 5079 /* For the special asic with mem ecc enabled but sram ecc 5080 * not enabled, even if the ras block is not supported on 5081 * .ras_enabled, if the asic supports poison mode and the 5082 * ras block has ras configuration, it can be considered 5083 * that the ras block supports ras function. 5084 */ 5085 if (!ret && 5086 (block == AMDGPU_RAS_BLOCK__GFX || 5087 block == AMDGPU_RAS_BLOCK__SDMA || 5088 block == AMDGPU_RAS_BLOCK__VCN || 5089 block == AMDGPU_RAS_BLOCK__JPEG) && 5090 (amdgpu_ras_mask & (1 << block)) && 5091 amdgpu_ras_is_poison_mode_supported(adev) && 5092 amdgpu_ras_get_ras_block(adev, block, 0)) 5093 ret = 1; 5094 5095 return ret; 5096 } 5097 5098 int amdgpu_ras_reset_gpu(struct amdgpu_device *adev) 5099 { 5100 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 5101 5102 /* mode1 is the only selection for RMA status */ 5103 if (amdgpu_ras_is_rma(adev)) { 5104 ras->gpu_reset_flags = 0; 5105 ras->gpu_reset_flags |= AMDGPU_RAS_GPU_RESET_MODE1_RESET; 5106 } 5107 5108 if (atomic_cmpxchg(&ras->in_recovery, 0, 1) == 0) { 5109 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 5110 int hive_ras_recovery = 0; 5111 5112 if (hive) { 5113 hive_ras_recovery = atomic_read(&hive->ras_recovery); 5114 amdgpu_put_xgmi_hive(hive); 5115 } 5116 /* In the case of multiple GPUs, after a GPU has started 5117 * resetting all GPUs on hive, other GPUs do not need to 5118 * trigger GPU reset again. 
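		 * The in_recovery flag claimed by the atomic_cmpxchg() above
		 * is released in that case, since no recovery work gets
		 * scheduled on this device.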
5119 */ 5120 if (!hive_ras_recovery) 5121 amdgpu_reset_domain_schedule(ras->adev->reset_domain, &ras->recovery_work); 5122 else 5123 atomic_set(&ras->in_recovery, 0); 5124 } else { 5125 flush_work(&ras->recovery_work); 5126 amdgpu_reset_domain_schedule(ras->adev->reset_domain, &ras->recovery_work); 5127 } 5128 5129 return 0; 5130 } 5131 5132 int amdgpu_ras_set_mca_debug_mode(struct amdgpu_device *adev, bool enable) 5133 { 5134 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 5135 int ret = 0; 5136 5137 if (con) { 5138 ret = amdgpu_mca_smu_set_debug_mode(adev, enable); 5139 if (!ret) 5140 con->is_aca_debug_mode = enable; 5141 } 5142 5143 return ret; 5144 } 5145 5146 int amdgpu_ras_set_aca_debug_mode(struct amdgpu_device *adev, bool enable) 5147 { 5148 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 5149 int ret = 0; 5150 5151 if (con) { 5152 if (amdgpu_aca_is_enabled(adev)) 5153 ret = amdgpu_aca_smu_set_debug_mode(adev, enable); 5154 else 5155 ret = amdgpu_mca_smu_set_debug_mode(adev, enable); 5156 if (!ret) 5157 con->is_aca_debug_mode = enable; 5158 } 5159 5160 return ret; 5161 } 5162 5163 bool amdgpu_ras_get_aca_debug_mode(struct amdgpu_device *adev) 5164 { 5165 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 5166 const struct aca_smu_funcs *smu_funcs = adev->aca.smu_funcs; 5167 const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs; 5168 5169 if (!con) 5170 return false; 5171 5172 if ((amdgpu_aca_is_enabled(adev) && smu_funcs && smu_funcs->set_debug_mode) || 5173 (!amdgpu_aca_is_enabled(adev) && mca_funcs && mca_funcs->mca_set_debug_mode)) 5174 return con->is_aca_debug_mode; 5175 else 5176 return true; 5177 } 5178 5179 bool amdgpu_ras_get_error_query_mode(struct amdgpu_device *adev, 5180 unsigned int *error_query_mode) 5181 { 5182 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 5183 const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs; 5184 const struct aca_smu_funcs *smu_funcs = adev->aca.smu_funcs; 5185 5186 if (!con) { 5187 *error_query_mode = AMDGPU_RAS_INVALID_ERROR_QUERY; 5188 return false; 5189 } 5190 5191 if (amdgpu_sriov_vf(adev)) { 5192 *error_query_mode = AMDGPU_RAS_VIRT_ERROR_COUNT_QUERY; 5193 } else if ((smu_funcs && smu_funcs->set_debug_mode) || (mca_funcs && mca_funcs->mca_set_debug_mode)) { 5194 *error_query_mode = 5195 (con->is_aca_debug_mode) ? 
AMDGPU_RAS_DIRECT_ERROR_QUERY : AMDGPU_RAS_FIRMWARE_ERROR_QUERY; 5196 } else { 5197 *error_query_mode = AMDGPU_RAS_DIRECT_ERROR_QUERY; 5198 } 5199 5200 return true; 5201 } 5202 5203 /* Register each ip ras block into amdgpu ras */ 5204 int amdgpu_ras_register_ras_block(struct amdgpu_device *adev, 5205 struct amdgpu_ras_block_object *ras_block_obj) 5206 { 5207 struct amdgpu_ras_block_list *ras_node; 5208 if (!adev || !ras_block_obj) 5209 return -EINVAL; 5210 5211 ras_node = kzalloc_obj(*ras_node); 5212 if (!ras_node) 5213 return -ENOMEM; 5214 5215 INIT_LIST_HEAD(&ras_node->node); 5216 ras_node->ras_obj = ras_block_obj; 5217 list_add_tail(&ras_node->node, &adev->ras_list); 5218 5219 return 0; 5220 } 5221 5222 void amdgpu_ras_get_error_type_name(uint32_t err_type, char *err_type_name) 5223 { 5224 if (!err_type_name) 5225 return; 5226 5227 switch (err_type) { 5228 case AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE: 5229 sprintf(err_type_name, "correctable"); 5230 break; 5231 case AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE: 5232 sprintf(err_type_name, "uncorrectable"); 5233 break; 5234 default: 5235 sprintf(err_type_name, "unknown"); 5236 break; 5237 } 5238 } 5239 5240 bool amdgpu_ras_inst_get_memory_id_field(struct amdgpu_device *adev, 5241 const struct amdgpu_ras_err_status_reg_entry *reg_entry, 5242 uint32_t instance, 5243 uint32_t *memory_id) 5244 { 5245 uint32_t err_status_lo_data, err_status_lo_offset; 5246 5247 if (!reg_entry) 5248 return false; 5249 5250 err_status_lo_offset = 5251 AMDGPU_RAS_REG_ENTRY_OFFSET(reg_entry->hwip, instance, 5252 reg_entry->seg_lo, reg_entry->reg_lo); 5253 err_status_lo_data = RREG32(err_status_lo_offset); 5254 5255 if ((reg_entry->flags & AMDGPU_RAS_ERR_STATUS_VALID) && 5256 !REG_GET_FIELD(err_status_lo_data, ERR_STATUS_LO, ERR_STATUS_VALID_FLAG)) 5257 return false; 5258 5259 *memory_id = REG_GET_FIELD(err_status_lo_data, ERR_STATUS_LO, MEMORY_ID); 5260 5261 return true; 5262 } 5263 5264 bool amdgpu_ras_inst_get_err_cnt_field(struct amdgpu_device *adev, 5265 const struct amdgpu_ras_err_status_reg_entry *reg_entry, 5266 uint32_t instance, 5267 unsigned long *err_cnt) 5268 { 5269 uint32_t err_status_hi_data, err_status_hi_offset; 5270 5271 if (!reg_entry) 5272 return false; 5273 5274 err_status_hi_offset = 5275 AMDGPU_RAS_REG_ENTRY_OFFSET(reg_entry->hwip, instance, 5276 reg_entry->seg_hi, reg_entry->reg_hi); 5277 err_status_hi_data = RREG32(err_status_hi_offset); 5278 5279 if ((reg_entry->flags & AMDGPU_RAS_ERR_INFO_VALID) && 5280 !REG_GET_FIELD(err_status_hi_data, ERR_STATUS_HI, ERR_INFO_VALID_FLAG)) 5281 /* keep the check here in case we need to refer to the result later */ 5282 dev_dbg(adev->dev, "Invalid err_info field\n"); 5283 5284 /* read err count */ 5285 *err_cnt = REG_GET_FIELD(err_status_hi_data, ERR_STATUS, ERR_CNT); 5286 5287 return true; 5288 } 5289 5290 void amdgpu_ras_inst_query_ras_error_count(struct amdgpu_device *adev, 5291 const struct amdgpu_ras_err_status_reg_entry *reg_list, 5292 uint32_t reg_list_size, 5293 const struct amdgpu_ras_memory_id_entry *mem_list, 5294 uint32_t mem_list_size, 5295 uint32_t instance, 5296 uint32_t err_type, 5297 unsigned long *err_count) 5298 { 5299 uint32_t memory_id; 5300 unsigned long err_cnt; 5301 char err_type_name[16]; 5302 uint32_t i, j; 5303 5304 for (i = 0; i < reg_list_size; i++) { 5305 /* query memory_id from err_status_lo */ 5306 if (!amdgpu_ras_inst_get_memory_id_field(adev, ®_list[i], 5307 instance, &memory_id)) 5308 continue; 5309 5310 /* query err_cnt from err_status_hi */ 5311 if 
(!amdgpu_ras_inst_get_err_cnt_field(adev, ®_list[i], 5312 instance, &err_cnt) || 5313 !err_cnt) 5314 continue; 5315 5316 *err_count += err_cnt; 5317 5318 /* log the errors */ 5319 amdgpu_ras_get_error_type_name(err_type, err_type_name); 5320 if (!mem_list) { 5321 /* memory_list is not supported */ 5322 dev_info(adev->dev, 5323 "%ld %s hardware errors detected in %s, instance: %d, memory_id: %d\n", 5324 err_cnt, err_type_name, 5325 reg_list[i].block_name, 5326 instance, memory_id); 5327 } else { 5328 for (j = 0; j < mem_list_size; j++) { 5329 if (memory_id == mem_list[j].memory_id) { 5330 dev_info(adev->dev, 5331 "%ld %s hardware errors detected in %s, instance: %d, memory block: %s\n", 5332 err_cnt, err_type_name, 5333 reg_list[i].block_name, 5334 instance, mem_list[j].name); 5335 break; 5336 } 5337 } 5338 } 5339 } 5340 } 5341 5342 void amdgpu_ras_inst_reset_ras_error_count(struct amdgpu_device *adev, 5343 const struct amdgpu_ras_err_status_reg_entry *reg_list, 5344 uint32_t reg_list_size, 5345 uint32_t instance) 5346 { 5347 uint32_t err_status_lo_offset, err_status_hi_offset; 5348 uint32_t i; 5349 5350 for (i = 0; i < reg_list_size; i++) { 5351 err_status_lo_offset = 5352 AMDGPU_RAS_REG_ENTRY_OFFSET(reg_list[i].hwip, instance, 5353 reg_list[i].seg_lo, reg_list[i].reg_lo); 5354 err_status_hi_offset = 5355 AMDGPU_RAS_REG_ENTRY_OFFSET(reg_list[i].hwip, instance, 5356 reg_list[i].seg_hi, reg_list[i].reg_hi); 5357 WREG32(err_status_lo_offset, 0); 5358 WREG32(err_status_hi_offset, 0); 5359 } 5360 } 5361 5362 int amdgpu_ras_error_data_init(struct ras_err_data *err_data) 5363 { 5364 memset(err_data, 0, sizeof(*err_data)); 5365 5366 INIT_LIST_HEAD(&err_data->err_node_list); 5367 5368 return 0; 5369 } 5370 5371 static void amdgpu_ras_error_node_release(struct ras_err_node *err_node) 5372 { 5373 if (!err_node) 5374 return; 5375 5376 list_del(&err_node->node); 5377 kvfree(err_node); 5378 } 5379 5380 void amdgpu_ras_error_data_fini(struct ras_err_data *err_data) 5381 { 5382 struct ras_err_node *err_node, *tmp; 5383 5384 list_for_each_entry_safe(err_node, tmp, &err_data->err_node_list, node) 5385 amdgpu_ras_error_node_release(err_node); 5386 } 5387 5388 static struct ras_err_node *amdgpu_ras_error_find_node_by_id(struct ras_err_data *err_data, 5389 struct amdgpu_smuio_mcm_config_info *mcm_info) 5390 { 5391 struct ras_err_node *err_node; 5392 struct amdgpu_smuio_mcm_config_info *ref_id; 5393 5394 if (!err_data || !mcm_info) 5395 return NULL; 5396 5397 for_each_ras_error(err_node, err_data) { 5398 ref_id = &err_node->err_info.mcm_info; 5399 5400 if (mcm_info->socket_id == ref_id->socket_id && 5401 mcm_info->die_id == ref_id->die_id) 5402 return err_node; 5403 } 5404 5405 return NULL; 5406 } 5407 5408 static struct ras_err_node *amdgpu_ras_error_node_new(void) 5409 { 5410 struct ras_err_node *err_node; 5411 5412 err_node = kvzalloc_obj(*err_node); 5413 if (!err_node) 5414 return NULL; 5415 5416 INIT_LIST_HEAD(&err_node->node); 5417 5418 return err_node; 5419 } 5420 5421 static int ras_err_info_cmp(void *priv, const struct list_head *a, const struct list_head *b) 5422 { 5423 struct ras_err_node *nodea = container_of(a, struct ras_err_node, node); 5424 struct ras_err_node *nodeb = container_of(b, struct ras_err_node, node); 5425 struct amdgpu_smuio_mcm_config_info *infoa = &nodea->err_info.mcm_info; 5426 struct amdgpu_smuio_mcm_config_info *infob = &nodeb->err_info.mcm_info; 5427 5428 if (unlikely(infoa->socket_id != infob->socket_id)) 5429 return infoa->socket_id - infob->socket_id; 5430 else 5431 return 
infoa->die_id - infob->die_id; 5432 5433 return 0; 5434 } 5435 5436 static struct ras_err_info *amdgpu_ras_error_get_info(struct ras_err_data *err_data, 5437 struct amdgpu_smuio_mcm_config_info *mcm_info) 5438 { 5439 struct ras_err_node *err_node; 5440 5441 err_node = amdgpu_ras_error_find_node_by_id(err_data, mcm_info); 5442 if (err_node) 5443 return &err_node->err_info; 5444 5445 err_node = amdgpu_ras_error_node_new(); 5446 if (!err_node) 5447 return NULL; 5448 5449 memcpy(&err_node->err_info.mcm_info, mcm_info, sizeof(*mcm_info)); 5450 5451 err_data->err_list_count++; 5452 list_add_tail(&err_node->node, &err_data->err_node_list); 5453 list_sort(NULL, &err_data->err_node_list, ras_err_info_cmp); 5454 5455 return &err_node->err_info; 5456 } 5457 5458 int amdgpu_ras_error_statistic_ue_count(struct ras_err_data *err_data, 5459 struct amdgpu_smuio_mcm_config_info *mcm_info, 5460 u64 count) 5461 { 5462 struct ras_err_info *err_info; 5463 5464 if (!err_data || !mcm_info) 5465 return -EINVAL; 5466 5467 if (!count) 5468 return 0; 5469 5470 err_info = amdgpu_ras_error_get_info(err_data, mcm_info); 5471 if (!err_info) 5472 return -EINVAL; 5473 5474 err_info->ue_count += count; 5475 err_data->ue_count += count; 5476 5477 return 0; 5478 } 5479 5480 int amdgpu_ras_error_statistic_ce_count(struct ras_err_data *err_data, 5481 struct amdgpu_smuio_mcm_config_info *mcm_info, 5482 u64 count) 5483 { 5484 struct ras_err_info *err_info; 5485 5486 if (!err_data || !mcm_info) 5487 return -EINVAL; 5488 5489 if (!count) 5490 return 0; 5491 5492 err_info = amdgpu_ras_error_get_info(err_data, mcm_info); 5493 if (!err_info) 5494 return -EINVAL; 5495 5496 err_info->ce_count += count; 5497 err_data->ce_count += count; 5498 5499 return 0; 5500 } 5501 5502 int amdgpu_ras_error_statistic_de_count(struct ras_err_data *err_data, 5503 struct amdgpu_smuio_mcm_config_info *mcm_info, 5504 u64 count) 5505 { 5506 struct ras_err_info *err_info; 5507 5508 if (!err_data || !mcm_info) 5509 return -EINVAL; 5510 5511 if (!count) 5512 return 0; 5513 5514 err_info = amdgpu_ras_error_get_info(err_data, mcm_info); 5515 if (!err_info) 5516 return -EINVAL; 5517 5518 err_info->de_count += count; 5519 err_data->de_count += count; 5520 5521 return 0; 5522 } 5523 5524 #define mmMP0_SMN_C2PMSG_92 0x1609C 5525 #define mmMP0_SMN_C2PMSG_126 0x160BE 5526 static void amdgpu_ras_boot_time_error_reporting(struct amdgpu_device *adev, 5527 u32 instance) 5528 { 5529 u32 socket_id, aid_id, hbm_id; 5530 u32 fw_status; 5531 u32 boot_error; 5532 u64 reg_addr; 5533 5534 /* The pattern for smn addressing in other SOC could be different from 5535 * the one for aqua_vanjaram. We should revisit the code if the pattern 5536 * is changed. In such case, replace the aqua_vanjaram implementation 5537 * with more common helper */ 5538 reg_addr = (mmMP0_SMN_C2PMSG_92 << 2) + 5539 amdgpu_reg_get_smn_base64(adev, MP0_HWIP, instance); 5540 fw_status = amdgpu_device_indirect_rreg_ext(adev, reg_addr); 5541 5542 reg_addr = (mmMP0_SMN_C2PMSG_126 << 2) + 5543 amdgpu_reg_get_smn_base64(adev, MP0_HWIP, instance); 5544 boot_error = amdgpu_device_indirect_rreg_ext(adev, reg_addr); 5545 5546 socket_id = AMDGPU_RAS_GPU_ERR_SOCKET_ID(boot_error); 5547 aid_id = AMDGPU_RAS_GPU_ERR_AID_ID(boot_error); 5548 hbm_id = ((1 == AMDGPU_RAS_GPU_ERR_HBM_ID(boot_error)) ? 
0 : 1); 5549 5550 if (AMDGPU_RAS_GPU_ERR_MEM_TRAINING(boot_error)) 5551 dev_info(adev->dev, 5552 "socket: %d, aid: %d, hbm: %d, fw_status: 0x%x, memory training failed\n", 5553 socket_id, aid_id, hbm_id, fw_status); 5554 5555 if (AMDGPU_RAS_GPU_ERR_FW_LOAD(boot_error)) 5556 dev_info(adev->dev, 5557 "socket: %d, aid: %d, fw_status: 0x%x, firmware load failed at boot time\n", 5558 socket_id, aid_id, fw_status); 5559 5560 if (AMDGPU_RAS_GPU_ERR_WAFL_LINK_TRAINING(boot_error)) 5561 dev_info(adev->dev, 5562 "socket: %d, aid: %d, fw_status: 0x%x, wafl link training failed\n", 5563 socket_id, aid_id, fw_status); 5564 5565 if (AMDGPU_RAS_GPU_ERR_XGMI_LINK_TRAINING(boot_error)) 5566 dev_info(adev->dev, 5567 "socket: %d, aid: %d, fw_status: 0x%x, xgmi link training failed\n", 5568 socket_id, aid_id, fw_status); 5569 5570 if (AMDGPU_RAS_GPU_ERR_USR_CP_LINK_TRAINING(boot_error)) 5571 dev_info(adev->dev, 5572 "socket: %d, aid: %d, fw_status: 0x%x, usr cp link training failed\n", 5573 socket_id, aid_id, fw_status); 5574 5575 if (AMDGPU_RAS_GPU_ERR_USR_DP_LINK_TRAINING(boot_error)) 5576 dev_info(adev->dev, 5577 "socket: %d, aid: %d, fw_status: 0x%x, usr dp link training failed\n", 5578 socket_id, aid_id, fw_status); 5579 5580 if (AMDGPU_RAS_GPU_ERR_HBM_MEM_TEST(boot_error)) 5581 dev_info(adev->dev, 5582 "socket: %d, aid: %d, hbm: %d, fw_status: 0x%x, hbm memory test failed\n", 5583 socket_id, aid_id, hbm_id, fw_status); 5584 5585 if (AMDGPU_RAS_GPU_ERR_HBM_BIST_TEST(boot_error)) 5586 dev_info(adev->dev, 5587 "socket: %d, aid: %d, hbm: %d, fw_status: 0x%x, hbm bist test failed\n", 5588 socket_id, aid_id, hbm_id, fw_status); 5589 5590 if (AMDGPU_RAS_GPU_ERR_DATA_ABORT(boot_error)) 5591 dev_info(adev->dev, 5592 "socket: %d, aid: %d, fw_status: 0x%x, data abort exception\n", 5593 socket_id, aid_id, fw_status); 5594 5595 if (AMDGPU_RAS_GPU_ERR_GENERIC(boot_error)) 5596 dev_info(adev->dev, 5597 "socket: %d, aid: %d, fw_status: 0x%x, Boot Controller Generic Error\n", 5598 socket_id, aid_id, fw_status); 5599 } 5600 5601 static bool amdgpu_ras_boot_error_detected(struct amdgpu_device *adev, 5602 u32 instance) 5603 { 5604 u64 reg_addr; 5605 u32 reg_data; 5606 int retry_loop; 5607 5608 reg_addr = (mmMP0_SMN_C2PMSG_92 << 2) + 5609 amdgpu_reg_get_smn_base64(adev, MP0_HWIP, instance); 5610 5611 for (retry_loop = 0; retry_loop < AMDGPU_RAS_BOOT_STATUS_POLLING_LIMIT; retry_loop++) { 5612 reg_data = amdgpu_device_indirect_rreg_ext(adev, reg_addr); 5613 if ((reg_data & AMDGPU_RAS_BOOT_STATUS_MASK) == AMDGPU_RAS_BOOT_STEADY_STATUS) 5614 return false; 5615 else 5616 msleep(1); 5617 } 5618 5619 return true; 5620 } 5621 5622 void amdgpu_ras_query_boot_status(struct amdgpu_device *adev, u32 num_instances) 5623 { 5624 u32 i; 5625 5626 for (i = 0; i < num_instances; i++) { 5627 if (amdgpu_ras_boot_error_detected(adev, i)) 5628 amdgpu_ras_boot_time_error_reporting(adev, i); 5629 } 5630 } 5631 5632 int amdgpu_ras_reserve_page(struct amdgpu_device *adev, uint64_t pfn) 5633 { 5634 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 5635 struct amdgpu_vram_mgr *mgr = &adev->mman.vram_mgr; 5636 uint64_t start = pfn << AMDGPU_GPU_PAGE_SHIFT; 5637 int ret = 0; 5638 5639 if (amdgpu_ras_check_critical_address(adev, start)) 5640 return 0; 5641 5642 mutex_lock(&con->page_rsv_lock); 5643 ret = amdgpu_vram_mgr_query_page_status(mgr, start); 5644 if (ret == -ENOENT) 5645 ret = amdgpu_vram_mgr_reserve_range(mgr, start, AMDGPU_GPU_PAGE_SIZE); 5646 mutex_unlock(&con->page_rsv_lock); 5647 5648 return ret; 5649 } 5650 5651 void 
amdgpu_ras_event_log_print(struct amdgpu_device *adev, u64 event_id, 5652 const char *fmt, ...) 5653 { 5654 struct va_format vaf; 5655 va_list args; 5656 5657 va_start(args, fmt); 5658 vaf.fmt = fmt; 5659 vaf.va = &args; 5660 5661 if (RAS_EVENT_ID_IS_VALID(event_id)) 5662 dev_printk(KERN_INFO, adev->dev, "{%llu}%pV", event_id, &vaf); 5663 else 5664 dev_printk(KERN_INFO, adev->dev, "%pV", &vaf); 5665 5666 va_end(args); 5667 } 5668 5669 bool amdgpu_ras_is_rma(struct amdgpu_device *adev) 5670 { 5671 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 5672 5673 if (amdgpu_uniras_enabled(adev)) 5674 return amdgpu_ras_mgr_is_rma(adev); 5675 5676 if (!con) 5677 return false; 5678 5679 return con->is_rma; 5680 } 5681 5682 int amdgpu_ras_add_critical_region(struct amdgpu_device *adev, 5683 struct amdgpu_bo *bo) 5684 { 5685 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 5686 struct amdgpu_vram_mgr_resource *vres; 5687 struct ras_critical_region *region; 5688 struct gpu_buddy_block *block; 5689 int ret = 0; 5690 5691 if (!bo || !bo->tbo.resource) 5692 return -EINVAL; 5693 5694 vres = to_amdgpu_vram_mgr_resource(bo->tbo.resource); 5695 5696 mutex_lock(&con->critical_region_lock); 5697 5698 /* Check if the bo had been recorded */ 5699 list_for_each_entry(region, &con->critical_region_head, node) 5700 if (region->bo == bo) 5701 goto out; 5702 5703 /* Record new critical amdgpu bo */ 5704 list_for_each_entry(block, &vres->blocks, link) { 5705 region = kzalloc_obj(*region); 5706 if (!region) { 5707 ret = -ENOMEM; 5708 goto out; 5709 } 5710 region->bo = bo; 5711 region->start = amdgpu_vram_mgr_block_start(block); 5712 region->size = amdgpu_vram_mgr_block_size(block); 5713 list_add_tail(®ion->node, &con->critical_region_head); 5714 } 5715 5716 out: 5717 mutex_unlock(&con->critical_region_lock); 5718 5719 return ret; 5720 } 5721 5722 static void amdgpu_ras_critical_region_init(struct amdgpu_device *adev) 5723 { 5724 amdgpu_ras_add_critical_region(adev, adev->mman.fw_reserved_memory); 5725 } 5726 5727 static void amdgpu_ras_critical_region_fini(struct amdgpu_device *adev) 5728 { 5729 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 5730 struct ras_critical_region *region, *tmp; 5731 5732 mutex_lock(&con->critical_region_lock); 5733 list_for_each_entry_safe(region, tmp, &con->critical_region_head, node) { 5734 list_del(®ion->node); 5735 kfree(region); 5736 } 5737 mutex_unlock(&con->critical_region_lock); 5738 } 5739 5740 bool amdgpu_ras_check_critical_address(struct amdgpu_device *adev, uint64_t addr) 5741 { 5742 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 5743 struct ras_critical_region *region; 5744 bool ret = false; 5745 5746 mutex_lock(&con->critical_region_lock); 5747 list_for_each_entry(region, &con->critical_region_head, node) { 5748 if ((region->start <= addr) && 5749 (addr < (region->start + region->size))) { 5750 ret = true; 5751 break; 5752 } 5753 } 5754 mutex_unlock(&con->critical_region_lock); 5755 5756 return ret; 5757 } 5758 5759 void amdgpu_ras_pre_reset(struct amdgpu_device *adev, 5760 struct list_head *device_list) 5761 { 5762 struct amdgpu_device *tmp_adev = NULL; 5763 5764 list_for_each_entry(tmp_adev, device_list, reset_list) { 5765 if (amdgpu_uniras_enabled(tmp_adev)) 5766 amdgpu_ras_mgr_pre_reset(tmp_adev); 5767 } 5768 } 5769 5770 void amdgpu_ras_post_reset(struct amdgpu_device *adev, 5771 struct list_head *device_list) 5772 { 5773 struct amdgpu_device *tmp_adev = NULL; 5774 5775 list_for_each_entry(tmp_adev, device_list, reset_list) { 5776 if 
(amdgpu_uniras_enabled(tmp_adev)) 5777 amdgpu_ras_mgr_post_reset(tmp_adev); 5778 } 5779 } 5780