Lines Matching full:ras

53 static const char *RAS_FS_NAME = "ras";
94 /* ras block link */
187 "RAS WARN: input address 0x%llx is invalid.\n", in amdgpu_reserve_page_direct()
192 "RAS WARN: 0x%llx has already been marked as bad page!\n", in amdgpu_reserve_page_direct()
213 dev_warn(adev->dev, "WARNING: THIS IS ONLY FOR TEST PURPOSES AND WILL CORRUPT RAS EEPROM\n"); in amdgpu_reserve_page_direct()
215 dev_warn(adev->dev, " echo 1 > /sys/kernel/debug/dri/0/ras/ras_eeprom_reset\n"); in amdgpu_reserve_page_direct()
455 "RAS inject mask(0x%x) isn't supported and force it to 0.\n", in amdgpu_ras_instance_mask_check()
481 "Adjust RAS inject mask 0x%x to 0x%x\n", in amdgpu_ras_instance_mask_check()
486 * DOC: AMDGPU RAS debugfs control interface
507 * - 0: disable RAS on the block. Take ::head as its data.
508 * - 1: enable RAS on the block. Take ::head as its data.
522 * echo "disable <block>" > /sys/kernel/debug/dri/<N>/ras/ras_ctrl
523 * echo "enable <block> <error>" > /sys/kernel/debug/dri/<N>/ras/ras_ctrl
524 …ect <block> <error> <sub-block> <address> <value> <mask>" > /sys/kernel/debug/dri/<N>/ras/ras_ctrl
548 * echo inject umc ue 0x0 0x0 0x0 > /sys/kernel/debug/dri/0/ras/ras_ctrl
549 * echo inject umc ce 0 0 0 3 > /sys/kernel/debug/dri/0/ras/ras_ctrl
550 * echo disable umc > /sys/kernel/debug/dri/0/ras/ras_ctrl
554 * To check disable/enable, see "ras" features at,
555 * /sys/class/drm/card[0/1/2...]/device/ras/features
558 * /sys/class/drm/card[0/1/2...]/device/ras/[gfx|sdma|umc|...]_err_count
562 * Check the "ras" mask at /sys/module/amdgpu/parameters/ras_mask
563 * to see which blocks support RAS on a particular asic.
575 dev_warn(adev->dev, "RAS WARN: error injection " in amdgpu_ras_debugfs_ctrl_write()
610 dev_warn(adev->dev, "RAS WARN: input address 0x%llx is invalid.", in amdgpu_ras_debugfs_ctrl_write()
614 dev_warn(adev->dev, "RAS WARN: inject: 0x%llx has already been marked as bad!\n", in amdgpu_ras_debugfs_ctrl_write()
638 * DOC: AMDGPU RAS debugfs EEPROM table reset interface
648 * echo 1 > ../ras/ras_eeprom_reset
694 * DOC: AMDGPU RAS sysfs Error Count Interface
697 * /sys/class/drm/card[0/1/2...]/device/ras/[gfx/sdma/...]_err_count
755 DRM_ERROR("RAS ERROR: Unbalance obj(%s) use\n", get_ras_block_str(&obj->head)); in put_obj()
856 /* If hardware does not support ras, then do not create obj. in __amdgpu_ras_feature_enable()
857 * But if hardware support ras, we can create the obj. in __amdgpu_ras_feature_enable()
858 * Ras framework checks con->hw_supported to see if it need do in __amdgpu_ras_feature_enable()
860 * IP checks con->support to see if it need disable ras. in __amdgpu_ras_feature_enable()
896 /* For non-gfx ip, do not enable ras feature if it is not allowed */ in amdgpu_ras_feature_enable()
898 /* Force issue enable or disable ras feature commands */ in amdgpu_ras_feature_enable()
903 /* Only enable gfx ras feature from host side */ in amdgpu_ras_feature_enable()
925 dev_err(adev->dev, "ras %s %s failed poison:%d ret:%d\n", in amdgpu_ras_feature_enable()
954 /* There is no harm to issue a ras TA cmd regardless of in amdgpu_ras_feature_enable_on_boot()
955 * the current ras state. in amdgpu_ras_feature_enable_on_boot()
961 /* With old ras TA, we might fail to enable ras. in amdgpu_ras_feature_enable_on_boot()
969 "RAS INFO: %s setup object\n", in amdgpu_ras_feature_enable_on_boot()
973 /* setup the object then issue a ras TA disable cmd.*/ in amdgpu_ras_feature_enable_on_boot()
978 /* gfx block ras disable cmd must send to ras-ta */ in amdgpu_ras_feature_enable_on_boot()
984 /* clean gfx block ras features flag */ in amdgpu_ras_feature_enable_on_boot()
1035 * bypass psp. vbios enable ras for us. in amdgpu_ras_enable_all_features()
1055 * bypass psp. vbios enable ras for us. in amdgpu_ras_enable_all_features()
1093 dev_warn(adev->dev, "Warning: abnormal ras list node.\n"); in amdgpu_ras_get_ras_block()
1112 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); in amdgpu_ras_get_ecc_info() local
1119 ret = amdgpu_dpm_get_ecc_info(adev, (void *)&(ras->umc_ecc)); in amdgpu_ras_get_ecc_info()
1121 if (adev->umc.ras && adev->umc.ras->ras_block.hw_ops && in amdgpu_ras_get_ecc_info()
1122 adev->umc.ras->ras_block.hw_ops->query_ras_error_count) in amdgpu_ras_get_ecc_info()
1123 adev->umc.ras->ras_block.hw_ops->query_ras_error_count(adev, err_data); in amdgpu_ras_get_ecc_info()
1128 if (adev->umc.ras && adev->umc.ras->ras_block.hw_ops && in amdgpu_ras_get_ecc_info()
1129 adev->umc.ras->ras_block.hw_ops->query_ras_error_address) in amdgpu_ras_get_ecc_info()
1130 adev->umc.ras->ras_block.hw_ops->query_ras_error_address(adev, err_data); in amdgpu_ras_get_ecc_info()
1132 if (adev->umc.ras && in amdgpu_ras_get_ecc_info()
1133 adev->umc.ras->ecc_info_query_ras_error_count) in amdgpu_ras_get_ecc_info()
1134 adev->umc.ras->ecc_info_query_ras_error_count(adev, err_data); in amdgpu_ras_get_ecc_info()
1136 if (adev->umc.ras && in amdgpu_ras_get_ecc_info()
1137 adev->umc.ras->ecc_info_query_ras_error_address) in amdgpu_ras_get_ecc_info()
1138 adev->umc.ras->ecc_info_query_ras_error_address(adev, err_data); in amdgpu_ras_get_ecc_info()
1475 dev_dbg_once(adev->dev, "%s doesn't config RAS function\n", in amdgpu_ras_query_error_status_helper()
1628 dev_dbg_once(adev->dev, "%s doesn't config RAS function\n", in amdgpu_ras_reset_error_count()
1640 /* skip ras error reset in gpu reset */ in amdgpu_ras_reset_error_count()
1718 dev_dbg_once(adev->dev, "%s doesn't config RAS function\n", in amdgpu_ras_error_inject()
1743 dev_err(adev->dev, "ras inject %s failed %d\n", in amdgpu_ras_error_inject()
1797 * all the ip blocks that support query ras error counters/status
1801 * supports RAS. Return -EOPNOTSUPP if the device doesn't support RAS.
1824 /* query all the ip blocks that support ras query interface */ in amdgpu_ras_query_error_count()
1872 * DOC: AMDGPU RAS sysfs gpu_vram_bad_pages Interface
1875 * /sys/class/drm/card[0/1/2...]/device/ras/gpu_vram_bad_pages
2007 size += sysfs_emit_at(buf, size, "ras version: %u.%u.%u\n", in amdgpu_ras_sysfs_version_show()
2165 * DOC: AMDGPU RAS Reboot Behavior for Unrecoverable Errors
2173 * /sys/kernel/debug/dri/[0/1/2...]/ras/auto_reboot
2179 * echo true > .../ras/auto_reboot
2219 * of RAS IPs during ras recovery. in amdgpu_ras_debugfs_create_ctrl_node()
2301 /* ras fs */
2359 dev_err(adev->dev, "Failed to create RAS sysfs group!"); in amdgpu_ras_fs_init()
2380 /* ras fs end */
2395 * If the current interrupt is caused by a non-fatal RAS error, skip in amdgpu_ras_interrupt_fatal_error_handler()
2410 if (adev->nbio.ras && in amdgpu_ras_interrupt_fatal_error_handler()
2411 adev->nbio.ras->handle_ras_controller_intr_no_bifring) in amdgpu_ras_interrupt_fatal_error_handler()
2412 adev->nbio.ras->handle_ras_controller_intr_no_bifring(adev); in amdgpu_ras_interrupt_fatal_error_handler()
2414 if (adev->nbio.ras && in amdgpu_ras_interrupt_fatal_error_handler()
2415 adev->nbio.ras->handle_ras_err_event_athub_intr_no_bifring) in amdgpu_ras_interrupt_fatal_error_handler()
2416 adev->nbio.ras->handle_ras_err_event_athub_intr_no_bifring(adev); in amdgpu_ras_interrupt_fatal_error_handler()
2447 dev_info(adev->dev, "No RAS poison status in %s poison IH.\n", in amdgpu_ras_interrupt_poison_consumption_handler()
2465 "GPU reset for %s RAS poison consumption is issued!\n", in amdgpu_ras_interrupt_poison_consumption_handler()
2559 "No RAS interrupt handler for non-UMC block with poison disabled.\n"); in amdgpu_ras_interrupt_handler()
2642 /* in case we register the IH before enable ras feature */ in amdgpu_ras_interrupt_add_handler()
2706 * PCIE_BIF IP has one different isr by ras controller in amdgpu_ras_log_on_err_counter()
2707 * interrupt, the specific ras counter query will be in amdgpu_ras_log_on_err_counter()
2757 dev_dbg_once(adev->dev, "%s doesn't config RAS function\n", in amdgpu_ras_error_status_query()
2883 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); in amdgpu_ras_in_recovery() local
2891 if (ras && (atomic_read(&ras->in_recovery) || hive_ras_recovery)) in amdgpu_ras_in_recovery()
2907 struct amdgpu_ras *ras = in amdgpu_ras_do_recovery() local
2910 struct amdgpu_device *adev = ras->adev; in amdgpu_ras_do_recovery()
2919 /* If any device which is part of the hive received RAS fatal in amdgpu_ras_do_recovery()
2931 if (!ras->disable_ras_err_cnt_harvest) { in amdgpu_ras_do_recovery()
2933 /* Build list of devices to query RAS related errors */ in amdgpu_ras_do_recovery()
2962 if (amdgpu_device_should_recover_gpu(ras->adev)) { in amdgpu_ras_do_recovery()
2972 if (!amdgpu_ras_is_poison_mode_supported(ras->adev)) in amdgpu_ras_do_recovery()
2977 if (ras->gpu_reset_flags & AMDGPU_RAS_GPU_RESET_MODE2_RESET) { in amdgpu_ras_do_recovery()
2978 ras->gpu_reset_flags &= ~AMDGPU_RAS_GPU_RESET_MODE2_RESET; in amdgpu_ras_do_recovery()
2985 if (ras->gpu_reset_flags & AMDGPU_RAS_GPU_RESET_MODE1_RESET) { in amdgpu_ras_do_recovery()
2986 ras->gpu_reset_flags &= ~AMDGPU_RAS_GPU_RESET_MODE1_RESET; in amdgpu_ras_do_recovery()
2993 amdgpu_device_gpu_recover(ras->adev, NULL, &reset_context); in amdgpu_ras_do_recovery()
2995 atomic_set(&ras->in_recovery, 0); in amdgpu_ras_do_recovery()
3046 /* tell RAS TA the node instance is not used */ in amdgpu_ras_mca2pa_by_idx()
3053 if (adev->umc.ras && adev->umc.ras->convert_ras_err_addr) in amdgpu_ras_mca2pa_by_idx()
3054 ret = adev->umc.ras->convert_ras_err_addr(adev, err_data, in amdgpu_ras_mca2pa_by_idx()
3073 if (adev->umc.ras && adev->umc.ras->get_die_id_from_pa) in amdgpu_ras_mca2pa()
3074 die_id = adev->umc.ras->get_die_id_from_pa(adev, bps->address, in amdgpu_ras_mca2pa()
3090 if (adev->umc.ras && adev->umc.ras->convert_ras_err_addr) in amdgpu_ras_mca2pa()
3091 return adev->umc.ras->convert_ras_err_addr(adev, err_data, in amdgpu_ras_mca2pa()
3253 &adev->psp.ras_context.ras->eeprom_control; in amdgpu_ras_add_bad_pages()
3392 &adev->psp.ras_context.ras->eeprom_control; in amdgpu_ras_load_bad_pages()
3408 if (adev->umc.ras && adev->umc.ras->convert_ras_err_addr) { in amdgpu_ras_load_bad_pages()
3637 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); in amdgpu_ras_poison_creation_handler() local
3645 ecc_log = &ras->umc_ecc_log; in amdgpu_ras_poison_creation_handler()
3664 schedule_delayed_work(&ras->page_retirement_dwork, 0); in amdgpu_ras_poison_creation_handler()
3666 if (amdgpu_ras_is_rma(adev) && atomic_cmpxchg(&ras->rma_in_recovery, 0, 1) == 0) in amdgpu_ras_poison_creation_handler()
3776 /* gpu mode-1 reset is ongoing or just completed ras mode-1 reset */ in amdgpu_ras_page_retirement_thread()
3833 if (!adev->umc.ras || !adev->umc.ras->convert_ras_err_addr) in amdgpu_ras_init_badpage_info()
3836 if (adev->umc.ras && in amdgpu_ras_init_badpage_info()
3837 adev->umc.ras->get_retire_flip_bits) in amdgpu_ras_init_badpage_info()
3838 adev->umc.ras->get_retire_flip_bits(adev); in amdgpu_ras_init_badpage_info()
3861 dev_warn(adev->dev, "Failed to format RAS EEPROM data in V3 version!\n"); in amdgpu_ras_init_badpage_info()
3877 /* Allow access to RAS EEPROM via debugfs, when the ASIC in amdgpu_ras_recovery_init()
3878 * supports RAS and debugfs is enabled, but when in amdgpu_ras_recovery_init()
3938 dev_warn(adev->dev, "Failed to initialize ras recovery! (%d)\n", ret); in amdgpu_ras_recovery_init()
4036 * force enable gfx ras, ignore vbios gfx ras flag
4053 /* Query ras capability via atomfirmware interface */
4077 * VCN/JPEG RAS can be supported on both bare metal and in amdgpu_ras_query_ras_capablity_from_vbios()
4091 * XGMI RAS is not supported if xgmi num physical nodes in amdgpu_ras_query_ras_capablity_from_vbios()
4118 adev->umc.ras && in amdgpu_ras_query_poison_mode()
4119 adev->umc.ras->query_ras_poison_mode) { in amdgpu_ras_query_poison_mode()
4123 adev->umc.ras->query_ras_poison_mode(adev); in amdgpu_ras_query_poison_mode()
4136 * check hardware's ras ability which will be saved in hw_supported.
4137 * if hardware does not support ras, we can skip some ras initialization and
4138 * forbid some ras operations from IP.
4139 * if software itself, say boot parameter, limit the ras ability. We still
4141 * we have to initialize ras as normal. but need check if operation is
4156 /* query ras capability from psp */ in amdgpu_ras_check_supported()
4160 /* query ras capability from bios */ in amdgpu_ras_check_supported()
4164 /* driver only manages a few IP blocks RAS feature in amdgpu_ras_check_supported()
4178 /* hw_supported needs to be aligned with RAS block mask. */ in amdgpu_ras_check_supported()
4248 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); in amdgpu_ras_event_mgr_init() local
4251 if (!ras) in amdgpu_ras_event_mgr_init()
4255 ras->event_mgr = hive ? &hive->event_mgr : &ras->__event_mgr; in amdgpu_ras_event_mgr_init()
4260 ras_event_mgr_init(ras->event_mgr); in amdgpu_ras_event_mgr_init()
4316 /* set gfx block ras context feature for VEGA20 Gaming in amdgpu_ras_init()
4317 * send ras disable cmd to ras ta during ras late init. in amdgpu_ras_init()
4336 /* initialize nbio ras function ahead of any other in amdgpu_ras_init()
4337 * ras functions so hardware fatal error interrupt in amdgpu_ras_init()
4344 adev->nbio.ras = &nbio_v7_4_ras; in amdgpu_ras_init()
4348 /* unlike other generation of nbio ras, in amdgpu_ras_init()
4352 * enable nbio ras in such case. Instead, in amdgpu_ras_init()
4353 * check DF RAS */ in amdgpu_ras_init()
4354 adev->nbio.ras = &nbio_v4_3_ras; in amdgpu_ras_init()
4358 /* unlike other generation of nbio ras, in amdgpu_ras_init()
4362 * enable nbio ras in such case. Instead, in amdgpu_ras_init()
4363 * check DF RAS in amdgpu_ras_init()
4365 adev->nbio.ras = &nbif_v6_3_1_ras; in amdgpu_ras_init()
4370 adev->nbio.ras = &nbio_v7_9_ras; in amdgpu_ras_init()
4373 /* nbio ras is not available */ in amdgpu_ras_init()
4377 /* nbio ras block needs to be enabled ahead of other ras blocks in amdgpu_ras_init()
4383 if (adev->nbio.ras && in amdgpu_ras_init()
4384 adev->nbio.ras->init_ras_controller_interrupt) { in amdgpu_ras_init()
4385 r = adev->nbio.ras->init_ras_controller_interrupt(adev); in amdgpu_ras_init()
4390 if (adev->nbio.ras && in amdgpu_ras_init()
4391 adev->nbio.ras->init_ras_err_event_athub_interrupt) { in amdgpu_ras_init()
4392 r = adev->nbio.ras->init_ras_err_event_athub_interrupt(adev); in amdgpu_ras_init()
4397 /* Packed socket_id to ras feature mask bits[31:29] */ in amdgpu_ras_init()
4403 /* Get RAS schema for particular SOC */ in amdgpu_ras_init()
4428 dev_info(adev->dev, "RAS INFO: ras initialized successfully, " in amdgpu_ras_init()
4459 drm_warn(adev_to_drm(adev), "RAS init query failure"); in amdgpu_persistent_edc_harvesting()
4462 drm_warn(adev_to_drm(adev), "RAS init harvest reset failure"); in amdgpu_persistent_edc_harvesting()
4487 /* disable RAS feature per IP block if it is not supported */ in amdgpu_ras_block_late_init()
4496 /* in resume phase, if fail to enable ras, in amdgpu_ras_block_late_init()
4497 * clean up all ras fs nodes, and disable ras */ in amdgpu_ras_block_late_init()
4506 /* in resume phase, no need to create ras fs node */ in amdgpu_ras_block_late_init()
4557 /* helper function to remove ras fs node and interrupt handler */
4587 /* clean ras context for VEGA20 Gaming after send ras disable cmd */ in amdgpu_ras_resume()
4595 * tricky thing that IP's actual ras error type should be in amdgpu_ras_resume()
4601 /* We enable ras on all hw_supported block, but as boot in amdgpu_ras_resume()
4623 /* Make sure all ras objects are disabled. */ in amdgpu_ras_suspend()
4654 /* Guest side doesn't need init ras feature */ in amdgpu_ras_late_init()
4661 dev_warn(adev->dev, "Warning: abnormal ras list node.\n"); in amdgpu_ras_late_init()
4693 /* Need disable ras on all IPs here before ip [hw/sw]fini */ in amdgpu_ras_pre_fini()
4722 /* Clear ras blocks from ras_list and free ras block list node */ in amdgpu_ras_fini()
4752 struct amdgpu_ras *ras; in amdgpu_ras_get_fed_status() local
4754 ras = amdgpu_ras_get_context(adev); in amdgpu_ras_get_fed_status()
4755 if (!ras) in amdgpu_ras_get_fed_status()
4758 return test_bit(AMDGPU_RAS_BLOCK__LAST, &ras->ras_err_state); in amdgpu_ras_get_fed_status()
4763 struct amdgpu_ras *ras; in amdgpu_ras_set_fed() local
4765 ras = amdgpu_ras_get_context(adev); in amdgpu_ras_set_fed()
4766 if (ras) { in amdgpu_ras_set_fed()
4768 set_bit(AMDGPU_RAS_BLOCK__LAST, &ras->ras_err_state); in amdgpu_ras_set_fed()
4770 clear_bit(AMDGPU_RAS_BLOCK__LAST, &ras->ras_err_state); in amdgpu_ras_set_fed()
4776 struct amdgpu_ras *ras; in amdgpu_ras_clear_err_state() local
4778 ras = amdgpu_ras_get_context(adev); in amdgpu_ras_clear_err_state()
4779 if (ras) { in amdgpu_ras_clear_err_state()
4780 ras->ras_err_state = 0; in amdgpu_ras_clear_err_state()
4781 ras->gpu_reset_flags = 0; in amdgpu_ras_clear_err_state()
4788 struct amdgpu_ras *ras; in amdgpu_ras_set_err_poison() local
4790 ras = amdgpu_ras_get_context(adev); in amdgpu_ras_set_err_poison()
4791 if (ras) in amdgpu_ras_set_err_poison()
4792 set_bit(block, &ras->ras_err_state); in amdgpu_ras_set_err_poison()
4797 struct amdgpu_ras *ras; in amdgpu_ras_is_err_state() local
4799 ras = amdgpu_ras_get_context(adev); in amdgpu_ras_is_err_state()
4800 if (ras) { in amdgpu_ras_is_err_state()
4802 return (ras->ras_err_state != 0); in amdgpu_ras_is_err_state()
4804 return test_bit(block, &ras->ras_err_state) || in amdgpu_ras_is_err_state()
4806 &ras->ras_err_state); in amdgpu_ras_is_err_state()
4814 struct amdgpu_ras *ras; in __get_ras_event_mgr() local
4816 ras = amdgpu_ras_get_context(adev); in __get_ras_event_mgr()
4817 if (!ras) in __get_ras_event_mgr()
4820 return ras->event_mgr; in __get_ras_event_mgr()
4850 dev_warn(adev->dev, "failed mark ras event (%d) in %ps, ret:%d\n", in amdgpu_ras_mark_ras_event_caller()
4886 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); in amdgpu_ras_global_ras_isr() local
4900 ras->gpu_reset_flags |= AMDGPU_RAS_GPU_RESET_MODE1_RESET; in amdgpu_ras_global_ras_isr()
5061 return adev->psp.ras_context.ras; in amdgpu_ras_get_context()
5069 adev->psp.ras_context.ras = ras_con; in amdgpu_ras_set_context()
5073 /* check if ras is supported on block, say, sdma, gfx */
5078 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); in amdgpu_ras_is_supported() local
5083 ret = ras && (adev->ras_enabled & (1 << block)); in amdgpu_ras_is_supported()
5086 * not enabled, even if the ras block is not supported on in amdgpu_ras_is_supported()
5088 * ras block has ras configuration, it can be considered in amdgpu_ras_is_supported()
5089 * that the ras block supports ras function. in amdgpu_ras_is_supported()
5106 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); in amdgpu_ras_reset_gpu() local
5110 ras->gpu_reset_flags = 0; in amdgpu_ras_reset_gpu()
5111 ras->gpu_reset_flags |= AMDGPU_RAS_GPU_RESET_MODE1_RESET; in amdgpu_ras_reset_gpu()
5114 if (atomic_cmpxchg(&ras->in_recovery, 0, 1) == 0) { in amdgpu_ras_reset_gpu()
5127 amdgpu_reset_domain_schedule(ras->adev->reset_domain, &ras->recovery_work); in amdgpu_ras_reset_gpu()
5129 atomic_set(&ras->in_recovery, 0); in amdgpu_ras_reset_gpu()
5131 flush_work(&ras->recovery_work); in amdgpu_ras_reset_gpu()
5132 amdgpu_reset_domain_schedule(ras->adev->reset_domain, &ras->recovery_work); in amdgpu_ras_reset_gpu()
5209 /* Register each ip ras block into amdgpu ras */