// SPDX-License-Identifier: MIT
/*
 * Copyright 2025 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 */
#include "ras_sys.h"
#include "amdgpu_ras_mgr.h"
#include "amdgpu_ras.h"
#include "amdgpu_reset.h"

static int amdgpu_ras_sys_detect_fatal_event(struct ras_core_context *ras_core, void *data)
{
	struct amdgpu_device *adev = (struct amdgpu_device *)ras_core->dev;
	int ret;
	uint64_t seq_no;

	ret = amdgpu_ras_global_ras_isr(adev);
	if (ret)
		return ret;

	seq_no = amdgpu_ras_mgr_gen_ras_event_seqno(adev, RAS_SEQNO_TYPE_UE);
	RAS_DEV_INFO(adev,
		     "{%llu} Uncorrectable hardware error(ERREVENT_ATHUB_INTERRUPT) detected!\n",
		     seq_no);

	return amdgpu_ras_process_handle_unexpected_interrupt(adev, data);
}

static int amdgpu_ras_sys_poison_consumption_event(struct ras_core_context *ras_core,
						   void *data)
{
	struct amdgpu_device *adev = (struct amdgpu_device *)ras_core->dev;
	struct ras_event_req *req = (struct ras_event_req *)data;
	pasid_notify pasid_fn;

	if (!req)
		return -EINVAL;

	if (req->pasid_fn) {
		pasid_fn = (pasid_notify)req->pasid_fn;
		pasid_fn(adev, req->pasid, req->data);
	}

	return 0;
}

static int amdgpu_ras_sys_gen_seqno(struct ras_core_context *ras_core,
				    enum ras_seqno_type seqno_type, uint64_t *seqno)
{
	struct amdgpu_device *adev = (struct amdgpu_device *)ras_core->dev;
	struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
	struct ras_event_manager *event_mgr;
	struct ras_event_state *event_state;
	struct amdgpu_hive_info *hive;
	enum ras_event_type event_type;
	uint64_t seq_no;

	if (!ras_mgr || !seqno ||
	    (seqno_type >= RAS_SEQNO_TYPE_COUNT_MAX))
		return -EINVAL;

	switch (seqno_type) {
	case RAS_SEQNO_TYPE_UE:
		event_type = RAS_EVENT_TYPE_FATAL;
		break;
	case RAS_SEQNO_TYPE_CE:
	case RAS_SEQNO_TYPE_DE:
		event_type = RAS_EVENT_TYPE_POISON_CREATION;
		break;
	case RAS_SEQNO_TYPE_POISON_CONSUMPTION:
		event_type = RAS_EVENT_TYPE_POISON_CONSUMPTION;
		break;
	default:
		event_type = RAS_EVENT_TYPE_INVALID;
		break;
	}

	hive = amdgpu_get_xgmi_hive(adev);
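	/*
	 * Prefer the hive-wide event manager so sequence numbers stay
	 * monotonic across all devices in an XGMI hive; fall back to the
	 * per-device manager otherwise. While a fatal error is being
	 * recovered, reuse the last fatal sequence number rather than
	 * allocating a new one.
	 */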
	event_mgr = hive ? &hive->event_mgr : &ras_mgr->ras_event_mgr;
	event_state = &event_mgr->event_state[event_type];
	if ((event_type == RAS_EVENT_TYPE_FATAL) && amdgpu_ras_in_recovery(adev)) {
		seq_no = event_state->last_seqno;
	} else {
		seq_no = atomic64_inc_return(&event_mgr->seqno);
		event_state->last_seqno = seq_no;
		atomic64_inc(&event_state->count);
	}
	amdgpu_put_xgmi_hive(hive);

	*seqno = seq_no;
	return 0;
}

static int amdgpu_ras_sys_event_notifier(struct ras_core_context *ras_core,
					 enum ras_notify_event event_id, void *data)
{
	struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(ras_core->dev);
	int ret = 0;

	switch (event_id) {
	case RAS_EVENT_ID__BAD_PAGE_DETECTED:
		schedule_delayed_work(&ras_mgr->retire_page_dwork, 0);
		break;
	case RAS_EVENT_ID__POISON_CONSUMPTION:
		amdgpu_ras_sys_poison_consumption_event(ras_core, data);
		break;
	case RAS_EVENT_ID__RESERVE_BAD_PAGE:
		ret = amdgpu_ras_reserve_page(ras_core->dev, *(uint64_t *)data);
		break;
	case RAS_EVENT_ID__FATAL_ERROR_DETECTED:
		ret = amdgpu_ras_sys_detect_fatal_event(ras_core, data);
		break;
	case RAS_EVENT_ID__UPDATE_BAD_PAGE_NUM:
		ret = amdgpu_dpm_send_hbm_bad_pages_num(ras_core->dev, *(uint32_t *)data);
		break;
	case RAS_EVENT_ID__UPDATE_BAD_CHANNEL_BITMAP:
		ret = amdgpu_dpm_send_hbm_bad_channel_flag(ras_core->dev, *(uint32_t *)data);
		break;
	case RAS_EVENT_ID__DEVICE_RMA:
		ras_log_ring_add_log_event(ras_core, RAS_LOG_EVENT_RMA, NULL, NULL);
		ret = amdgpu_dpm_send_rma_reason(ras_core->dev);
		break;
	case RAS_EVENT_ID__RESET_GPU:
		ret = amdgpu_ras_mgr_reset_gpu(ras_core->dev, *(uint32_t *)data);
		break;
	case RAS_EVENT_ID__RAS_EVENT_PROC_BEGIN:
		ret = amdgpu_ras_process_begin(ras_core->dev);
		break;
	case RAS_EVENT_ID__RAS_EVENT_PROC_END:
		ret = amdgpu_ras_process_end(ras_core->dev);
		break;
	default:
		RAS_DEV_WARN(ras_core->dev, "Invalid ras notify event:%d\n", event_id);
		break;
	}

	return ret;
}

static u64 amdgpu_ras_sys_get_utc_second_timestamp(struct ras_core_context *ras_core)
{
	return ktime_get_real_seconds();
}

static int amdgpu_ras_sys_check_gpu_status(struct ras_core_context *ras_core,
					   uint32_t *status)
{
	struct amdgpu_device *adev = (struct amdgpu_device *)ras_core->dev;
	uint32_t gpu_status = 0;

	if (amdgpu_in_reset(adev) || amdgpu_ras_in_recovery(adev))
		gpu_status |= RAS_GPU_STATUS__IN_RESET;

	if (amdgpu_sriov_vf(adev))
		gpu_status |= RAS_GPU_STATUS__IS_VF;

	*status = gpu_status;

	return 0;
}

static int amdgpu_ras_sys_get_device_system_info(struct ras_core_context *ras_core,
						 struct device_system_info *dev_info)
{
	struct amdgpu_device *adev = (struct amdgpu_device *)ras_core->dev;

	dev_info->device_id = adev->pdev->device;
	dev_info->vendor_id = adev->pdev->vendor;
	dev_info->socket_id = adev->smuio.funcs->get_socket_id(adev);

	return 0;
}

static int amdgpu_ras_sys_gpu_reset_lock(struct ras_core_context *ras_core,
					 bool down, bool try)
{
	struct amdgpu_device *adev = (struct amdgpu_device *)ras_core->dev;
	int ret = 0;

	if (down && try)
		ret = down_read_trylock(&adev->reset_domain->sem);
	else if (down)
		down_read(&adev->reset_domain->sem);
	else
		up_read(&adev->reset_domain->sem);

	return ret;
}
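
/*
 * Report whether a fatal RAS interrupt has been flagged. The
 * amdgpu_ras_in_intr atomic is raised on the fatal-error interrupt path
 * (see amdgpu_ras_sys_detect_fatal_event() above).
 */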
static bool
amdgpu_ras_sys_detect_ras_interrupt(struct ras_core_context *ras_core)
{
	return !!atomic_read(&amdgpu_ras_in_intr);
}

static int amdgpu_ras_sys_get_gpu_mem(struct ras_core_context *ras_core,
				      enum gpu_mem_type mem_type, struct gpu_mem_block *gpu_mem)
{
	struct amdgpu_device *adev = (struct amdgpu_device *)ras_core->dev;
	struct psp_context *psp = &adev->psp;
	struct psp_ring *psp_ring;
	struct ta_mem_context *mem_ctx;

	if (mem_type == GPU_MEM_TYPE_RAS_PSP_RING) {
		psp_ring = &psp->km_ring;
		gpu_mem->mem_bo = adev->firmware.rbuf;
		gpu_mem->mem_size = psp_ring->ring_size;
		gpu_mem->mem_mc_addr = psp_ring->ring_mem_mc_addr;
		gpu_mem->mem_cpu_addr = psp_ring->ring_mem;
	} else if (mem_type == GPU_MEM_TYPE_RAS_PSP_CMD) {
		gpu_mem->mem_bo = psp->cmd_buf_bo;
		gpu_mem->mem_size = PSP_CMD_BUFFER_SIZE;
		gpu_mem->mem_mc_addr = psp->cmd_buf_mc_addr;
		gpu_mem->mem_cpu_addr = psp->cmd_buf_mem;
	} else if (mem_type == GPU_MEM_TYPE_RAS_PSP_FENCE) {
		gpu_mem->mem_bo = psp->fence_buf_bo;
		gpu_mem->mem_size = PSP_FENCE_BUFFER_SIZE;
		gpu_mem->mem_mc_addr = psp->fence_buf_mc_addr;
		gpu_mem->mem_cpu_addr = psp->fence_buf;
	} else if (mem_type == GPU_MEM_TYPE_RAS_TA_FW) {
		gpu_mem->mem_bo = psp->fw_pri_bo;
		gpu_mem->mem_size = PSP_1_MEG;
		gpu_mem->mem_mc_addr = psp->fw_pri_mc_addr;
		gpu_mem->mem_cpu_addr = psp->fw_pri_buf;
	} else if (mem_type == GPU_MEM_TYPE_RAS_TA_CMD) {
		mem_ctx = &psp->ras_context.context.mem_context;
		gpu_mem->mem_bo = mem_ctx->shared_bo;
		gpu_mem->mem_size = mem_ctx->shared_mem_size;
		gpu_mem->mem_mc_addr = mem_ctx->shared_mc_addr;
		gpu_mem->mem_cpu_addr = mem_ctx->shared_buf;
	} else {
		return -EINVAL;
	}

	if (!gpu_mem->mem_bo || !gpu_mem->mem_size ||
	    !gpu_mem->mem_mc_addr || !gpu_mem->mem_cpu_addr) {
		RAS_DEV_ERR(ras_core->dev, "The ras psp gpu memory is invalid!\n");
		return -ENOMEM;
	}

	return 0;
}

static int amdgpu_ras_sys_put_gpu_mem(struct ras_core_context *ras_core,
				      enum gpu_mem_type mem_type, struct gpu_mem_block *gpu_mem)
{
	return 0;
}

const struct ras_sys_func amdgpu_ras_sys_fn = {
	.ras_notifier = amdgpu_ras_sys_event_notifier,
	.get_utc_second_timestamp = amdgpu_ras_sys_get_utc_second_timestamp,
	.gen_seqno = amdgpu_ras_sys_gen_seqno,
	.check_gpu_status = amdgpu_ras_sys_check_gpu_status,
	.get_device_system_info = amdgpu_ras_sys_get_device_system_info,
	.gpu_reset_lock = amdgpu_ras_sys_gpu_reset_lock,
	.detect_ras_interrupt = amdgpu_ras_sys_detect_ras_interrupt,
	.get_gpu_mem = amdgpu_ras_sys_get_gpu_mem,
	.put_gpu_mem = amdgpu_ras_sys_put_gpu_mem,
};
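
/*
 * Hedged usage sketch: the RAS core is expected to call back into the
 * driver through this table, along the lines of
 *
 *	ras_core->sys_fn->ras_notifier(ras_core,
 *				       RAS_EVENT_ID__RESET_GPU, &reset_flags);
 *
 * The sys_fn field name is an assumption for illustration only; the actual
 * registration of &amdgpu_ras_sys_fn happens in the RAS core/manager setup
 * code, outside this file.
 */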