1 /* SPDX-License-Identifier: MIT */ 2 /* 3 * Copyright 2025 Advanced Micro Devices, Inc. 4 * 5 * Permission is hereby granted, free of charge, to any person obtaining a 6 * copy of this software and associated documentation files (the "Software"), 7 * to deal in the Software without restriction, including without limitation 8 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 9 * and/or sell copies of the Software, and to permit persons to whom the 10 * Software is furnished to do so, subject to the following conditions: 11 * 12 * The above copyright notice and this permission notice shall be included in 13 * all copies or substantial portions of the Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 19 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 20 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 21 * OTHER DEALINGS IN THE SOFTWARE. 22 * 23 */ 24 25 #ifndef __RAS_H__ 26 #define __RAS_H__ 27 #include "ras_sys.h" 28 #include "ras_umc.h" 29 #include "ras_aca.h" 30 #include "ras_eeprom.h" 31 #include "ras_core_status.h" 32 #include "ras_process.h" 33 #include "ras_gfx.h" 34 #include "ras_cmd.h" 35 #include "ras_nbio.h" 36 #include "ras_mp1.h" 37 #include "ras_psp.h" 38 #include "ras_log_ring.h" 39 40 #define RAS_HW_ERR "[Hardware Error]: " 41 42 #define RAS_GPU_PAGE_SHIFT 12 43 #define RAS_ADDR_TO_PFN(addr) ((addr) >> RAS_GPU_PAGE_SHIFT) 44 #define RAS_PFN_TO_ADDR(pfn) ((pfn) << RAS_GPU_PAGE_SHIFT) 45 46 #define RAS_CORE_RESET_GPU 0x10000 47 48 #define GPU_RESET_CAUSE_POISON (RAS_CORE_RESET_GPU | 0x0001) 49 #define GPU_RESET_CAUSE_FATAL (RAS_CORE_RESET_GPU | 0x0002) 50 #define GPU_RESET_CAUSE_RMA (RAS_CORE_RESET_GPU | 0x0004) 51 52 enum ras_block_id { 53 RAS_BLOCK_ID__UMC = 0, 54 RAS_BLOCK_ID__SDMA, 55 RAS_BLOCK_ID__GFX, 56 RAS_BLOCK_ID__MMHUB, 57 RAS_BLOCK_ID__ATHUB, 58 RAS_BLOCK_ID__PCIE_BIF, 59 RAS_BLOCK_ID__HDP, 60 RAS_BLOCK_ID__XGMI_WAFL, 61 RAS_BLOCK_ID__DF, 62 RAS_BLOCK_ID__SMN, 63 RAS_BLOCK_ID__SEM, 64 RAS_BLOCK_ID__MP0, 65 RAS_BLOCK_ID__MP1, 66 RAS_BLOCK_ID__FUSE, 67 RAS_BLOCK_ID__MCA, 68 RAS_BLOCK_ID__VCN, 69 RAS_BLOCK_ID__JPEG, 70 RAS_BLOCK_ID__IH, 71 RAS_BLOCK_ID__MPIO, 72 73 RAS_BLOCK_ID__LAST 74 }; 75 76 enum ras_ecc_err_type { 77 RAS_ECC_ERR__NONE = 0, 78 RAS_ECC_ERR__PARITY = 1, 79 RAS_ECC_ERR__SINGLE_CORRECTABLE = 2, 80 RAS_ECC_ERR__MULTI_UNCORRECTABLE = 4, 81 RAS_ECC_ERR__POISON = 8, 82 }; 83 84 enum ras_err_type { 85 RAS_ERR_TYPE__UE = 0, 86 RAS_ERR_TYPE__CE, 87 RAS_ERR_TYPE__DE, 88 RAS_ERR_TYPE__LAST 89 }; 90 91 enum ras_seqno_type { 92 RAS_SEQNO_TYPE_INVALID = 0, 93 RAS_SEQNO_TYPE_UE, 94 RAS_SEQNO_TYPE_CE, 95 RAS_SEQNO_TYPE_DE, 96 RAS_SEQNO_TYPE_POISON_CONSUMPTION, 97 RAS_SEQNO_TYPE_COUNT_MAX, 98 }; 99 100 enum ras_seqno_fifo { 101 SEQNO_FIFO_INVALID = 0, 102 SEQNO_FIFO_POISON_CREATION, 103 SEQNO_FIFO_POISON_CONSUMPTION, 104 SEQNO_FIFO_COUNT_MAX 105 }; 106 107 enum ras_notify_event { 108 RAS_EVENT_ID__NONE, 109 RAS_EVENT_ID__BAD_PAGE_DETECTED, 110 RAS_EVENT_ID__POISON_CONSUMPTION, 111 RAS_EVENT_ID__RESERVE_BAD_PAGE, 112 RAS_EVENT_ID__DEVICE_RMA, 113 RAS_EVENT_ID__UPDATE_BAD_PAGE_NUM, 114 RAS_EVENT_ID__UPDATE_BAD_CHANNEL_BITMAP, 115 RAS_EVENT_ID__FATAL_ERROR_DETECTED, 116 RAS_EVENT_ID__RESET_GPU, 117 RAS_EVENT_ID__RESET_VF, 118 RAS_EVENT_ID__RAS_EVENT_PROC_BEGIN, 119 RAS_EVENT_ID__RAS_EVENT_PROC_END, 120 }; 121 122 enum ras_gpu_status { 123 RAS_GPU_STATUS__NOT_READY = 0, 124 RAS_GPU_STATUS__READY = 0x1, 125 RAS_GPU_STATUS__IN_RESET = 0x2, 126 RAS_GPU_STATUS__IS_RMA = 0x4, 127 RAS_GPU_STATUS__IS_VF = 0x8, 128 }; 129 130 struct ras_core_context; 131 struct ras_bank_ecc; 132 struct ras_umc; 133 struct ras_aca; 134 struct ras_process; 135 struct ras_nbio; 136 struct ras_log_ring; 137 struct ras_psp; 138 139 struct ras_mp1_sys_func { 140 int (*mp1_get_valid_bank_count)(struct ras_core_context *ras_core, 141 u32 msg, u32 *count); 142 int (*mp1_dump_valid_bank)(struct ras_core_context *ras_core, 143 u32 msg, u32 idx, u32 reg_idx, u64 *val); 144 }; 145 146 struct ras_eeprom_sys_func { 147 int (*eeprom_i2c_xfer)(struct ras_core_context *ras_core, 148 u32 eeprom_addr, u8 *eeprom_buf, u32 buf_size, bool read); 149 int (*update_eeprom_i2c_config)(struct ras_core_context *ras_core); 150 }; 151 152 struct ras_nbio_sys_func { 153 int (*set_ras_controller_irq_state)(struct ras_core_context *ras_core, 154 bool state); 155 int (*set_ras_err_event_athub_irq_state)(struct ras_core_context *ras_core, 156 bool state); 157 }; 158 159 struct ras_time { 160 int tm_sec; 161 int tm_min; 162 int tm_hour; 163 int tm_mday; 164 int tm_mon; 165 long tm_year; 166 }; 167 168 struct device_system_info { 169 uint32_t device_id; 170 uint32_t vendor_id; 171 uint32_t socket_id; 172 }; 173 174 enum gpu_mem_type { 175 GPU_MEM_TYPE_DEFAULT, 176 GPU_MEM_TYPE_RAS_PSP_RING, 177 GPU_MEM_TYPE_RAS_PSP_CMD, 178 GPU_MEM_TYPE_RAS_PSP_FENCE, 179 GPU_MEM_TYPE_RAS_TA_FW, 180 GPU_MEM_TYPE_RAS_TA_CMD, 181 }; 182 183 struct ras_psp_sys_func { 184 int (*get_ras_psp_system_status)(struct ras_core_context *ras_core, 185 struct ras_psp_sys_status *status); 186 int (*get_ras_ta_init_param)(struct ras_core_context *ras_core, 187 struct ras_ta_init_param *ras_ta_param); 188 }; 189 190 struct ras_sys_func { 191 int (*gpu_reset_lock)(struct ras_core_context *ras_core, 192 bool down, bool try); 193 int (*check_gpu_status)(struct ras_core_context *ras_core, 194 uint32_t *status); 195 int (*gen_seqno)(struct ras_core_context *ras_core, 196 enum ras_seqno_type seqno_type, uint64_t *seqno); 197 int (*async_handle_ras_event)(struct ras_core_context *ras_core, void *data); 198 int (*ras_notifier)(struct ras_core_context *ras_core, 199 enum ras_notify_event event_id, void *data); 200 u64 (*get_utc_second_timestamp)(struct ras_core_context *ras_core); 201 int (*get_device_system_info)(struct ras_core_context *ras_core, 202 struct device_system_info *dev_info); 203 bool (*detect_ras_interrupt)(struct ras_core_context *ras_core); 204 int (*get_gpu_mem)(struct ras_core_context *ras_core, 205 enum gpu_mem_type mem_type, struct gpu_mem_block *gpu_mem); 206 int (*put_gpu_mem)(struct ras_core_context *ras_core, 207 enum gpu_mem_type mem_type, struct gpu_mem_block *gpu_mem); 208 }; 209 210 struct ras_ecc_count { 211 uint64_t new_ce_count; 212 uint64_t total_ce_count; 213 uint64_t new_ue_count; 214 uint64_t total_ue_count; 215 uint64_t new_de_count; 216 uint64_t total_de_count; 217 }; 218 219 struct ras_bank_ecc { 220 uint32_t nps; 221 uint64_t seq_no; 222 uint64_t status; 223 uint64_t ipid; 224 uint64_t addr; 225 }; 226 227 struct ras_bank_ecc_node { 228 struct list_head node; 229 struct ras_bank_ecc ecc; 230 }; 231 232 struct ras_aca_config { 233 u32 socket_num_per_hive; 234 u32 aid_num_per_socket; 235 u32 xcd_num_per_aid; 236 }; 237 238 struct ras_mp1_config { 239 const struct ras_mp1_sys_func *mp1_sys_fn; 240 }; 241 242 struct ras_nbio_config { 243 const struct ras_nbio_sys_func *nbio_sys_fn; 244 }; 245 246 struct ras_psp_config { 247 const struct ras_psp_sys_func *psp_sys_fn; 248 }; 249 250 struct ras_umc_config { 251 uint32_t umc_vram_type; 252 }; 253 254 struct ras_eeprom_config { 255 const struct ras_eeprom_sys_func *eeprom_sys_fn; 256 int eeprom_record_threshold_config; 257 uint32_t eeprom_record_threshold_count; 258 void *eeprom_i2c_adapter; 259 u32 eeprom_i2c_addr; 260 u32 eeprom_i2c_port; 261 u16 max_i2c_read_len; 262 u16 max_i2c_write_len; 263 }; 264 265 struct ras_core_config { 266 u32 aca_ip_version; 267 u32 umc_ip_version; 268 u32 mp1_ip_version; 269 u32 gfx_ip_version; 270 u32 nbio_ip_version; 271 u32 psp_ip_version; 272 273 bool poison_supported; 274 bool ras_eeprom_supported; 275 const struct ras_sys_func *sys_fn; 276 277 struct ras_aca_config aca_cfg; 278 struct ras_mp1_config mp1_cfg; 279 struct ras_nbio_config nbio_cfg; 280 struct ras_psp_config psp_cfg; 281 struct ras_eeprom_config eeprom_cfg; 282 struct ras_umc_config umc_cfg; 283 }; 284 285 struct ras_core_context { 286 void *dev; 287 struct ras_core_config *config; 288 u32 socket_num_per_hive; 289 u32 aid_num_per_socket; 290 u32 xcd_num_per_aid; 291 int max_ue_banks_per_query; 292 int max_ce_banks_per_query; 293 struct ras_aca ras_aca; 294 295 bool ras_eeprom_supported; 296 struct ras_eeprom_control ras_eeprom; 297 298 struct ras_psp ras_psp; 299 struct ras_umc ras_umc; 300 struct ras_nbio ras_nbio; 301 struct ras_gfx ras_gfx; 302 struct ras_mp1 ras_mp1; 303 struct ras_process ras_proc; 304 struct ras_cmd_mgr ras_cmd; 305 struct ras_log_ring ras_log_ring; 306 307 const struct ras_sys_func *sys_fn; 308 309 /* is poison mode supported */ 310 bool poison_supported; 311 312 bool is_rma; 313 bool is_initialized; 314 315 struct kfifo de_seqno_fifo; 316 struct kfifo consumption_seqno_fifo; 317 spinlock_t seqno_lock; 318 319 bool ras_core_enabled; 320 }; 321 322 struct ras_core_context *ras_core_create(struct ras_core_config *init_config); 323 void ras_core_destroy(struct ras_core_context *ras_core); 324 int ras_core_sw_init(struct ras_core_context *ras_core); 325 int ras_core_sw_fini(struct ras_core_context *ras_core); 326 int ras_core_hw_init(struct ras_core_context *ras_core); 327 int ras_core_hw_fini(struct ras_core_context *ras_core); 328 bool ras_core_is_ready(struct ras_core_context *ras_core); 329 uint64_t ras_core_gen_seqno(struct ras_core_context *ras_core, 330 enum ras_seqno_type seqno_type); 331 uint64_t ras_core_get_seqno(struct ras_core_context *ras_core, 332 enum ras_seqno_type seqno_type, bool pop); 333 334 int ras_core_put_seqno(struct ras_core_context *ras_core, 335 enum ras_seqno_type seqno_type, uint64_t seqno); 336 337 int ras_core_update_ecc_info(struct ras_core_context *ras_core); 338 int ras_core_query_block_ecc_data(struct ras_core_context *ras_core, 339 enum ras_block_id block, struct ras_ecc_count *ecc_count); 340 341 bool ras_core_gpu_in_reset(struct ras_core_context *ras_core); 342 bool ras_core_gpu_is_rma(struct ras_core_context *ras_core); 343 bool ras_core_gpu_is_vf(struct ras_core_context *ras_core); 344 bool ras_core_handle_nbio_irq(struct ras_core_context *ras_core, void *data); 345 int ras_core_handle_fatal_error(struct ras_core_context *ras_core); 346 347 uint32_t ras_core_get_curr_nps_mode(struct ras_core_context *ras_core); 348 const char *ras_core_get_ras_block_name(enum ras_block_id block_id); 349 int ras_core_convert_timestamp_to_time(struct ras_core_context *ras_core, 350 uint64_t timestamp, struct ras_time *tm); 351 352 int ras_core_set_status(struct ras_core_context *ras_core, bool enable); 353 bool ras_core_is_enabled(struct ras_core_context *ras_core); 354 uint64_t ras_core_get_utc_second_timestamp(struct ras_core_context *ras_core); 355 int ras_core_translate_soc_pa_and_bank(struct ras_core_context *ras_core, 356 uint64_t *soc_pa, struct umc_bank_addr *bank_addr, bool bank_to_pa); 357 bool ras_core_ras_interrupt_detected(struct ras_core_context *ras_core); 358 int ras_core_get_gpu_mem(struct ras_core_context *ras_core, 359 enum gpu_mem_type mem_type, struct gpu_mem_block *gpu_mem); 360 int ras_core_put_gpu_mem(struct ras_core_context *ras_core, 361 enum gpu_mem_type mem_type, struct gpu_mem_block *gpu_mem); 362 bool ras_core_check_safety_watermark(struct ras_core_context *ras_core); 363 int ras_core_down_trylock_gpu_reset_lock(struct ras_core_context *ras_core); 364 void ras_core_down_gpu_reset_lock(struct ras_core_context *ras_core); 365 void ras_core_up_gpu_reset_lock(struct ras_core_context *ras_core); 366 int ras_core_event_notify(struct ras_core_context *ras_core, 367 enum ras_notify_event event_id, void *data); 368 int ras_core_get_device_system_info(struct ras_core_context *ras_core, 369 struct device_system_info *dev_info); 370 #endif 371