Lines matching refs: adev
31 static int amdgpu_umc_convert_error_address(struct amdgpu_device *adev, in amdgpu_umc_convert_error_address() argument
35 switch (amdgpu_ip_version(adev, UMC_HWIP, 0)) { in amdgpu_umc_convert_error_address()
37 umc_v6_7_convert_error_address(adev, in amdgpu_umc_convert_error_address()
41 dev_warn(adev->dev, in amdgpu_umc_convert_error_address()
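The cluster above shows amdgpu_umc_convert_error_address() picking a per-IP conversion routine by switching on amdgpu_ip_version(), and warning when no converter exists. A minimal standalone sketch of that dispatch shape, with made-up names (umc_hwip_version, convert_v6_7) rather than the kernel's types:

    #include <stdio.h>

    /* Illustrative stand-ins for the kernel's IP-version dispatch. */
    enum umc_hwip_version { UMC_6_7, UMC_UNKNOWN };

    static void convert_v6_7(unsigned long err_addr)
    {
        printf("v6_7 conversion of 0x%lx\n", err_addr);
    }

    static int convert_error_address(enum umc_hwip_version ver,
                                     unsigned long err_addr)
    {
        switch (ver) {
        case UMC_6_7:
            convert_v6_7(err_addr);
            return 0;
        default:
            /* mirrors the dev_warn() fallback in the listing */
            fprintf(stderr, "UMC address conversion unsupported\n");
            return -1;
        }
    }

    int main(void)
    {
        return convert_error_address(UMC_6_7, 0x1234000UL);
    }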
49 int amdgpu_umc_page_retirement_mca(struct amdgpu_device *adev, in amdgpu_umc_page_retirement_mca() argument
60 kcalloc(adev->umc.max_ras_err_cnt_per_query, in amdgpu_umc_page_retirement_mca()
63 dev_warn(adev->dev, in amdgpu_umc_page_retirement_mca()
69 err_data.err_addr_len = adev->umc.max_ras_err_cnt_per_query; in amdgpu_umc_page_retirement_mca()
74 ret = amdgpu_umc_convert_error_address(adev, &err_data, err_addr, in amdgpu_umc_page_retirement_mca()
80 amdgpu_ras_add_bad_pages(adev, err_data.err_addr, in amdgpu_umc_page_retirement_mca()
82 amdgpu_ras_save_bad_pages(adev, NULL); in amdgpu_umc_page_retirement_mca()
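amdgpu_umc_page_retirement_mca() allocates one record per possible reported error address, converts the MCA address, then adds and persists the bad pages. A hedged userspace sketch of that allocate-then-retire flow; bad_page_record and retire_pages are placeholders, not kernel API:

    #include <stdio.h>
    #include <stdlib.h>

    /* Hypothetical record standing in for eeprom_table_record. */
    struct bad_page_record { unsigned long retired_page; };

    static int retire_pages(unsigned int max_err_cnt)
    {
        struct bad_page_record *records;

        /* One record per potentially reported error address,
         * mirroring the kcalloc() in the listing. */
        records = calloc(max_err_cnt, sizeof(*records));
        if (!records) {
            fprintf(stderr, "Failed to alloc memory for err_addr records\n");
            return -1;
        }

        /* ... convert MCA addresses, add bad pages, persist them ... */

        free(records);
        return 0;
    }

    int main(void) { return retire_pages(16); }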
94 void amdgpu_umc_handle_bad_pages(struct amdgpu_device *adev, in amdgpu_umc_handle_bad_pages() argument
98 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); in amdgpu_umc_handle_bad_pages()
103 amdgpu_ras_get_error_query_mode(adev, &error_query_mode); in amdgpu_umc_handle_bad_pages()
106 ret = amdgpu_dpm_get_ecc_info(adev, (void *)&(con->umc_ecc)); in amdgpu_umc_handle_bad_pages()
109 if (adev->umc.ras && adev->umc.ras->ras_block.hw_ops && in amdgpu_umc_handle_bad_pages()
110 adev->umc.ras->ras_block.hw_ops->query_ras_error_count) in amdgpu_umc_handle_bad_pages()
111 adev->umc.ras->ras_block.hw_ops->query_ras_error_count(adev, ras_error_status); in amdgpu_umc_handle_bad_pages()
113 if (adev->umc.ras && adev->umc.ras->ras_block.hw_ops && in amdgpu_umc_handle_bad_pages()
114 adev->umc.ras->ras_block.hw_ops->query_ras_error_address && in amdgpu_umc_handle_bad_pages()
115 adev->umc.max_ras_err_cnt_per_query) { in amdgpu_umc_handle_bad_pages()
117 kcalloc(adev->umc.max_ras_err_cnt_per_query, in amdgpu_umc_handle_bad_pages()
124 dev_warn(adev->dev, "Failed to alloc memory for " in amdgpu_umc_handle_bad_pages()
127 err_data->err_addr_len = adev->umc.max_ras_err_cnt_per_query; in amdgpu_umc_handle_bad_pages()
132 adev->umc.ras->ras_block.hw_ops->query_ras_error_address(adev, ras_error_status); in amdgpu_umc_handle_bad_pages()
136 if (adev->umc.ras && in amdgpu_umc_handle_bad_pages()
137 adev->umc.ras->ecc_info_query_ras_error_count) in amdgpu_umc_handle_bad_pages()
138 adev->umc.ras->ecc_info_query_ras_error_count(adev, ras_error_status); in amdgpu_umc_handle_bad_pages()
140 if (adev->umc.ras && in amdgpu_umc_handle_bad_pages()
141 adev->umc.ras->ecc_info_query_ras_error_address && in amdgpu_umc_handle_bad_pages()
142 adev->umc.max_ras_err_cnt_per_query) { in amdgpu_umc_handle_bad_pages()
144 kcalloc(adev->umc.max_ras_err_cnt_per_query, in amdgpu_umc_handle_bad_pages()
151 dev_warn(adev->dev, "Failed to alloc memory for " in amdgpu_umc_handle_bad_pages()
154 err_data->err_addr_len = adev->umc.max_ras_err_cnt_per_query; in amdgpu_umc_handle_bad_pages()
159 adev->umc.ras->ecc_info_query_ras_error_address(adev, ras_error_status); in amdgpu_umc_handle_bad_pages()
168 amdgpu_ras_add_bad_pages(adev, err_data->err_addr, in amdgpu_umc_handle_bad_pages()
170 amdgpu_ras_save_bad_pages(adev, &err_count); in amdgpu_umc_handle_bad_pages()
172 amdgpu_dpm_send_hbm_bad_pages_num(adev, in amdgpu_umc_handle_bad_pages()
176 amdgpu_dpm_send_hbm_bad_channel_flag(adev, con->eeprom_control.bad_channel_bitmap); in amdgpu_umc_handle_bad_pages()
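Throughout amdgpu_umc_handle_bad_pages(), no RAS hook is called without NULL-checking the whole pointer chain first (adev->umc.ras, its hw_ops, then the specific callback). A self-contained illustration of that guarded-callback pattern; the structs here are simplified stand-ins for the kernel's op tables:

    #include <stdio.h>
    #include <stddef.h>

    /* Illustrative op table mirroring ras_block.hw_ops. */
    struct hw_ops {
        void (*query_ras_error_count)(void *status);
        void (*query_ras_error_address)(void *status);
    };

    struct umc_ras { struct hw_ops *hw_ops; };

    /* Every pointer on the chain is checked before the call, just
     * as the matches above do for adev->umc.ras. */
    static void query_errors(struct umc_ras *ras, void *status,
                             unsigned int max_err_cnt)
    {
        if (ras && ras->hw_ops && ras->hw_ops->query_ras_error_count)
            ras->hw_ops->query_ras_error_count(status);

        if (ras && ras->hw_ops && ras->hw_ops->query_ras_error_address &&
            max_err_cnt)
            ras->hw_ops->query_ras_error_address(status);
    }

    static void count_stub(void *status)   { (void)status; puts("count"); }
    static void address_stub(void *status) { (void)status; puts("address"); }

    int main(void)
    {
        struct hw_ops ops = { count_stub, address_stub };
        struct umc_ras ras = { &ops };

        query_errors(&ras, NULL, 16);   /* both callbacks fire */
        query_errors(NULL, NULL, 16);   /* safely skipped */
        return 0;
    }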
188 static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev, in amdgpu_umc_do_page_retirement() argument
194 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); in amdgpu_umc_do_page_retirement()
196 kgd2kfd_set_sram_ecc_flag(adev->kfd.dev); in amdgpu_umc_do_page_retirement()
197 amdgpu_umc_handle_bad_pages(adev, ras_error_status); in amdgpu_umc_do_page_retirement()
200 (reset || amdgpu_ras_is_rma(adev))) { in amdgpu_umc_do_page_retirement()
202 amdgpu_ras_reset_gpu(adev); in amdgpu_umc_do_page_retirement()
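amdgpu_umc_do_page_retirement() retires pages first and only then considers a reset, triggered either by the caller or by the device already being in the RMA (bad-page threshold exceeded) state. A compact sketch of that ordering, with printf standing in for the actual reset:

    #include <stdbool.h>
    #include <stdio.h>

    static void do_page_retirement(bool reset_requested, bool is_rma)
    {
        /* ... flag SRAM ECC to KFD, handle bad pages ... */
        if (reset_requested || is_rma)
            printf("scheduling GPU reset\n");
    }

    int main(void)
    {
        do_page_retirement(false, true);
        return 0;
    }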
208 int amdgpu_umc_pasid_poison_handler(struct amdgpu_device *adev, in amdgpu_umc_pasid_poison_handler() argument
214 if (adev->gmc.xgmi.connected_to_cpu || in amdgpu_umc_pasid_poison_handler()
215 adev->gmc.is_app_apu) { in amdgpu_umc_pasid_poison_handler()
220 kgd2kfd_set_sram_ecc_flag(adev->kfd.dev); in amdgpu_umc_pasid_poison_handler()
221 amdgpu_ras_reset_gpu(adev); in amdgpu_umc_pasid_poison_handler()
226 if (!amdgpu_sriov_vf(adev)) { in amdgpu_umc_pasid_poison_handler()
227 if (amdgpu_ip_version(adev, UMC_HWIP, 0) < IP_VERSION(12, 0, 0)) { in amdgpu_umc_pasid_poison_handler()
232 struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head); in amdgpu_umc_pasid_poison_handler()
238 ret = amdgpu_umc_do_page_retirement(adev, &err_data, NULL, reset); in amdgpu_umc_pasid_poison_handler()
248 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); in amdgpu_umc_pasid_poison_handler()
251 ret = amdgpu_ras_put_poison_req(adev, in amdgpu_umc_pasid_poison_handler()
259 if (adev->virt.ops && adev->virt.ops->ras_poison_handler) in amdgpu_umc_pasid_poison_handler()
260 adev->virt.ops->ras_poison_handler(adev, block); in amdgpu_umc_pasid_poison_handler()
262 dev_warn(adev->dev, in amdgpu_umc_pasid_poison_handler()
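amdgpu_umc_pasid_poison_handler() branches three ways: APUs and CPU-coherent XGMI parts flag SRAM ECC and reset directly, bare-metal dGPUs run page retirement (inline before UMC 12, queued as a poison request on newer IPs, per the IP_VERSION check above), and SR-IOV guests defer to the host through virt.ops. A simplified decision-tree sketch; the three bools are stand-ins for the adev fields in the listing:

    #include <stdbool.h>
    #include <stdio.h>

    static void poison_handler(bool connected_to_cpu, bool is_app_apu,
                               bool is_sriov_vf)
    {
        if (connected_to_cpu || is_app_apu) {
            /* integrated/coherent parts: reset right away */
            printf("flag SRAM ECC and reset GPU\n");
            return;
        }

        if (!is_sriov_vf)
            printf("retire bad pages on bare metal\n");
        else
            printf("forward poison event to host via virt ops\n");
    }

    int main(void)
    {
        poison_handler(false, false, true);
        return 0;
    }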
269 int amdgpu_umc_poison_handler(struct amdgpu_device *adev, in amdgpu_umc_poison_handler() argument
272 return amdgpu_umc_pasid_poison_handler(adev, in amdgpu_umc_poison_handler()
276 int amdgpu_umc_process_ras_data_cb(struct amdgpu_device *adev, in amdgpu_umc_process_ras_data_cb() argument
280 return amdgpu_umc_do_page_retirement(adev, ras_error_status, entry, in amdgpu_umc_process_ras_data_cb()
284 int amdgpu_umc_ras_sw_init(struct amdgpu_device *adev) in amdgpu_umc_ras_sw_init() argument
289 if (!adev->umc.ras) in amdgpu_umc_ras_sw_init()
292 ras = adev->umc.ras; in amdgpu_umc_ras_sw_init()
294 err = amdgpu_ras_register_ras_block(adev, &ras->ras_block); in amdgpu_umc_ras_sw_init()
296 dev_err(adev->dev, "Failed to register umc ras block!\n"); in amdgpu_umc_ras_sw_init()
300 strcpy(adev->umc.ras->ras_block.ras_comm.name, "umc"); in amdgpu_umc_ras_sw_init()
303 adev->umc.ras_if = &ras->ras_block.ras_comm; in amdgpu_umc_ras_sw_init()
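amdgpu_umc_ras_sw_init() returns early when the ASIC provides no UMC RAS implementation, registers the block, names it "umc", and caches the common interface pointer for later lookups. A minimal sketch of that shape; umc_ctx is hypothetical and the registration step itself is elided:

    #include <stdio.h>
    #include <string.h>

    struct ras_block { char name[32]; };
    struct umc_ctx  { struct ras_block block; struct ras_block *ras_if; };

    static int ras_sw_init(struct umc_ctx *ctx)
    {
        if (!ctx)
            return 0;                    /* no UMC RAS on this ASIC */

        /* amdgpu_ras_register_ras_block() would run here */
        strcpy(ctx->block.name, "umc");  /* mirrors the strcpy above */
        ctx->ras_if = &ctx->block;
        return 0;
    }

    int main(void)
    {
        struct umc_ctx ctx;

        ras_sw_init(&ctx);
        printf("registered block: %s\n", ctx.block.name);
        return 0;
    }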
314 int amdgpu_umc_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *ras_block) in amdgpu_umc_ras_late_init() argument
318 r = amdgpu_ras_block_late_init(adev, ras_block); in amdgpu_umc_ras_late_init()
322 if (amdgpu_sriov_vf(adev)) in amdgpu_umc_ras_late_init()
325 if (amdgpu_ras_is_supported(adev, ras_block->block)) { in amdgpu_umc_ras_late_init()
326 r = amdgpu_irq_get(adev, &adev->gmc.ecc_irq, 0); in amdgpu_umc_ras_late_init()
332 if (adev->umc.ras && in amdgpu_umc_ras_late_init()
333 adev->umc.ras->err_cnt_init) in amdgpu_umc_ras_late_init()
334 adev->umc.ras->err_cnt_init(adev); in amdgpu_umc_ras_late_init()
339 amdgpu_ras_block_late_fini(adev, ras_block); in amdgpu_umc_ras_late_init()
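amdgpu_umc_ras_late_init() unwinds with amdgpu_ras_block_late_fini() when grabbing the ECC interrupt fails, and skips the interrupt entirely under SR-IOV where the host owns it. A standalone goto-unwind sketch under those assumptions, with stubs replacing the real helpers:

    #include <stdio.h>

    static int  block_late_init(void)  { return 0; }
    static int  irq_get(void)          { return 0; }
    static void block_late_fini(void)  { printf("unwound late init\n"); }

    static int ras_late_init(int is_sriov_vf, int ras_supported)
    {
        int r = block_late_init();
        if (r)
            return r;

        if (is_sriov_vf)
            return 0;           /* host owns the ECC interrupt */

        if (ras_supported) {
            r = irq_get();
            if (r)
                goto late_fini;
        }

        /* the optional err_cnt_init() callback would run here */
        return 0;

    late_fini:
        block_late_fini();
        return r;
    }

    int main(void) { return ras_late_init(0, 1); }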
343 int amdgpu_umc_process_ecc_irq(struct amdgpu_device *adev, in amdgpu_umc_process_ecc_irq() argument
347 struct ras_common_if *ras_if = adev->umc.ras_if; in amdgpu_umc_process_ecc_irq()
357 amdgpu_ras_interrupt_dispatch(adev, &ih_data); in amdgpu_umc_process_ecc_irq()
390 int amdgpu_umc_loop_channels(struct amdgpu_device *adev, in amdgpu_umc_loop_channels() argument
398 if (adev->umc.node_inst_num) { in amdgpu_umc_loop_channels()
400 ret = func(adev, node_inst, umc_inst, ch_inst, data); in amdgpu_umc_loop_channels()
402 dev_err(adev->dev, "Node %d umc %d ch %d func returns %d\n", in amdgpu_umc_loop_channels()
409 ret = func(adev, 0, umc_inst, ch_inst, data); in amdgpu_umc_loop_channels()
411 dev_err(adev->dev, "Umc %d ch %d func returns %d\n", in amdgpu_umc_loop_channels()
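amdgpu_umc_loop_channels() walks every (node, umc, channel) instance and aborts at the first callback failure; the listing shows both a node-aware path and a legacy node-0 path. A runnable reconstruction of the node-aware loop, with instance counts invented for the example:

    #include <stdio.h>
    #include <stddef.h>

    typedef int (*umc_func)(int node, int umc, int ch, void *data);

    static int loop_channels(int node_num, int umc_num, int ch_num,
                             umc_func func, void *data)
    {
        int node, umc, ch, ret;

        for (node = 0; node < node_num; node++)
            for (umc = 0; umc < umc_num; umc++)
                for (ch = 0; ch < ch_num; ch++) {
                    ret = func(node, umc, ch, data);
                    if (ret) {
                        fprintf(stderr,
                            "Node %d umc %d ch %d func returns %d\n",
                            node, umc, ch, ret);
                        return ret;
                    }
                }
        return 0;
    }

    static int dump_inst(int node, int umc, int ch, void *data)
    {
        (void)data;
        printf("node %d umc %d ch %d\n", node, umc, ch);
        return 0;
    }

    int main(void) { return loop_channels(1, 2, 2, dump_inst, NULL); }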
421 int amdgpu_umc_update_ecc_status(struct amdgpu_device *adev, in amdgpu_umc_update_ecc_status() argument
424 if (adev->umc.ras->update_ecc_status) in amdgpu_umc_update_ecc_status()
425 return adev->umc.ras->update_ecc_status(adev, in amdgpu_umc_update_ecc_status()
430 int amdgpu_umc_logs_ecc_err(struct amdgpu_device *adev, in amdgpu_umc_logs_ecc_err() argument
433 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); in amdgpu_umc_logs_ecc_err()
449 int amdgpu_umc_pages_in_a_row(struct amdgpu_device *adev, in amdgpu_umc_pages_in_a_row() argument
456 err_data->err_addr_len = adev->umc.retire_unit; in amdgpu_umc_pages_in_a_row()
459 if (adev->umc.ras && adev->umc.ras->convert_ras_err_addr) in amdgpu_umc_pages_in_a_row()
460 return adev->umc.ras->convert_ras_err_addr(adev, err_data, NULL, in amdgpu_umc_pages_in_a_row()
466 int amdgpu_umc_lookup_bad_pages_in_a_row(struct amdgpu_device *adev, in amdgpu_umc_lookup_bad_pages_in_a_row() argument
472 err_data.err_addr = kcalloc(adev->umc.retire_unit, in amdgpu_umc_lookup_bad_pages_in_a_row()
475 dev_warn(adev->dev, "Failed to alloc memory in bad page lookup!\n"); in amdgpu_umc_lookup_bad_pages_in_a_row()
479 ret = amdgpu_umc_pages_in_a_row(adev, &err_data, pa_addr); in amdgpu_umc_lookup_bad_pages_in_a_row()
483 for (i = 0; i < adev->umc.retire_unit; i++) { in amdgpu_umc_lookup_bad_pages_in_a_row()
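amdgpu_umc_lookup_bad_pages_in_a_row() allocates retire_unit records, expands one physical address into every page of the same DRAM row, and copies the page numbers out. A sketch of that flow; the (pa >> 12) + i expansion is a placeholder for the real per-IP conversion done by convert_ras_err_addr:

    #include <stdio.h>
    #include <stdlib.h>

    struct record { unsigned long retired_page; };

    static int lookup_row_pages(unsigned long pa, unsigned int retire_unit,
                                unsigned long *pfns)
    {
        struct record *recs = calloc(retire_unit, sizeof(*recs));
        unsigned int i;

        if (!recs) {
            fprintf(stderr, "Failed to alloc memory in bad page lookup!\n");
            return -1;
        }

        /* placeholder expansion: every page of the row shares a pfn base */
        for (i = 0; i < retire_unit; i++)
            recs[i].retired_page = (pa >> 12) + i;

        for (i = 0; i < retire_unit; i++)
            pfns[i] = recs[i].retired_page;

        free(recs);
        return (int)retire_unit;
    }

    int main(void)
    {
        unsigned long pfns[4];
        int i, n = lookup_row_pages(0x1234000UL, 4, pfns);

        for (i = 0; i < n; i++)
            printf("retire pfn 0x%lx\n", pfns[i]);
        return 0;
    }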
496 int amdgpu_umc_mca_to_addr(struct amdgpu_device *adev, in amdgpu_umc_mca_to_addr() argument
511 if (adev->umc.ras && adev->umc.ras->convert_ras_err_addr) { in amdgpu_umc_mca_to_addr()
512 ret = adev->umc.ras->convert_ras_err_addr(adev, NULL, &addr_in, in amdgpu_umc_mca_to_addr()
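The final cluster shows amdgpu_umc_mca_to_addr() packing the MCA fields into an input struct and delegating to the per-IP convert_ras_err_addr hook. A minimal sketch of that shape; the identity conversion below is a stand-in for the IP-specific address math:

    #include <stdio.h>

    /* Illustrative stand-ins for the conversion hook's in/out types. */
    struct addr_in  { unsigned long err_addr;
                      unsigned int ch, umc, node, socket; };
    struct addr_out { unsigned long pa; };

    static int convert(const struct addr_in *in, struct addr_out *out)
    {
        out->pa = in->err_addr;   /* real hook applies per-IP math */
        return 0;
    }

    static int mca_to_addr(unsigned long err_addr, unsigned int ch,
                           unsigned int umc, unsigned int node,
                           unsigned int socket, unsigned long *pa)
    {
        struct addr_in in = { err_addr, ch, umc, node, socket };
        struct addr_out out;
        int ret = convert(&in, &out);

        if (ret)
            return ret;
        *pa = out.pa;
        return 0;
    }

    int main(void)
    {
        unsigned long pa;

        if (!mca_to_addr(0x1234000UL, 0, 1, 0, 0, &pa))
            printf("pa = 0x%lx\n", pa);
        return 0;
    }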