1 /* SPDX-License-Identifier: MIT */ 2 /* 3 * Copyright 2025 Advanced Micro Devices, Inc. 4 * 5 * Permission is hereby granted, free of charge, to any person obtaining a 6 * copy of this software and associated documentation files (the "Software"), 7 * to deal in the Software without restriction, including without limitation 8 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 9 * and/or sell copies of the Software, and to permit persons to whom the 10 * Software is furnished to do so, subject to the following conditions: 11 * 12 * The above copyright notice and this permission notice shall be included in 13 * all copies or substantial portions of the Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 19 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 20 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 21 * OTHER DEALINGS IN THE SOFTWARE. 22 * 23 */ 24 25 #ifndef __RAS_CMD_H__ 26 #define __RAS_CMD_H__ 27 #include "ras.h" 28 #include "ras_eeprom.h" 29 #include "ras_log_ring.h" 30 #include "ras_cper.h" 31 32 #define RAS_CMD_DEV_HANDLE_MAGIC 0xFEEDAD00UL 33 34 #define RAS_CMD_MAX_IN_SIZE 256 35 #define RAS_CMD_MAX_GPU_NUM 32 36 #define RAS_CMD_MAX_BAD_PAGES_PER_GROUP 32 37 38 /* position of instance value in sub_block_index of 39 * ta_ras_trigger_error_input, the sub block uses lower 12 bits 40 */ 41 #define RAS_TA_INST_MASK 0xfffff000 42 #define RAS_TA_INST_SHIFT 0xc 43 44 enum ras_cmd_interface_type { 45 RAS_CMD_INTERFACE_TYPE_NONE, 46 RAS_CMD_INTERFACE_TYPE_AMDGPU, 47 RAS_CMD_INTERFACE_TYPE_VF, 48 RAS_CMD_INTERFACE_TYPE_PF, 49 }; 50 51 enum ras_cmd_id_range { 52 RAS_CMD_ID_COMMON_START = 0, 53 RAS_CMD_ID_COMMON_END = 0x10000, 54 RAS_CMD_ID_AMDGPU_START = RAS_CMD_ID_COMMON_END, 55 RAS_CMD_ID_AMDGPU_END = 0x20000, 56 RAS_CMD_ID_MXGPU_START = RAS_CMD_ID_AMDGPU_END, 57 RAS_CMD_ID_MXGPU_END = 0x30000, 58 RAS_CMD_ID_MXGPU_VF_START = RAS_CMD_ID_MXGPU_END, 59 RAS_CMD_ID_MXGPU_VF_END = 0x40000, 60 }; 61 62 enum ras_cmd_id { 63 RAS_CMD__BEGIN = RAS_CMD_ID_COMMON_START, 64 RAS_CMD__QUERY_INTERFACE_INFO, 65 RAS_CMD__GET_DEVICES_INFO, 66 RAS_CMD__GET_BLOCK_ECC_STATUS, 67 RAS_CMD__INJECT_ERROR, 68 RAS_CMD__GET_BAD_PAGES, 69 RAS_CMD__CLEAR_BAD_PAGE_INFO, 70 RAS_CMD__RESET_ALL_ERROR_COUNTS, 71 RAS_CMD__GET_SAFE_FB_ADDRESS_RANGES, 72 RAS_CMD__TRANSLATE_FB_ADDRESS, 73 RAS_CMD__GET_LINK_TOPOLOGY, 74 RAS_CMD__GET_CPER_SNAPSHOT, 75 RAS_CMD__GET_CPER_RECORD, 76 RAS_CMD__GET_BATCH_TRACE_SNAPSHOT, 77 RAS_CMD__GET_BATCH_TRACE_RECORD, 78 RAS_CMD__SUPPORTED_MAX = RAS_CMD_ID_COMMON_END, 79 }; 80 81 enum ras_cmd_response { 82 RAS_CMD__SUCCESS = 0, 83 RAS_CMD__SUCCESS_EXEED_BUFFER, 84 RAS_CMD__ERROR_UKNOWN_CMD, 85 RAS_CMD__ERROR_INVALID_CMD, 86 RAS_CMD__ERROR_VERSION, 87 RAS_CMD__ERROR_INVALID_INPUT_SIZE, 88 RAS_CMD__ERROR_INVALID_INPUT_DATA, 89 RAS_CMD__ERROR_DRV_INIT_FAIL, 90 RAS_CMD__ERROR_ACCESS_DENIED, 91 RAS_CMD__ERROR_GENERIC, 92 RAS_CMD__ERROR_TIMEOUT, 93 }; 94 95 enum ras_error_type { 96 RAS_TYPE_ERROR__NONE = 0, 97 RAS_TYPE_ERROR__PARITY = 1, 98 RAS_TYPE_ERROR__SINGLE_CORRECTABLE = 2, 99 RAS_TYPE_ERROR__MULTI_UNCORRECTABLE = 4, 100 RAS_TYPE_ERROR__POISON = 8, 101 }; 102 103 struct ras_core_context; 104 struct ras_cmd_ctx; 105 106 struct ras_cmd_mgr { 107 struct list_head head; 108 struct ras_core_context *ras_core; 109 uint64_t dev_handle; 110 }; 111 112 struct ras_cmd_func_map { 113 uint32_t cmd_id; 114 int (*func)(struct ras_core_context *ras_core, 115 struct ras_cmd_ctx *cmd, void *data); 116 }; 117 118 struct ras_device_bdf { 119 union { 120 struct { 121 uint32_t function : 3; 122 uint32_t device : 5; 123 uint32_t bus : 8; 124 uint32_t domain : 16; 125 }; 126 uint32_t u32_all; 127 }; 128 }; 129 130 struct ras_cmd_param { 131 uint32_t idx_vf; 132 void *data; 133 }; 134 135 #pragma pack(push, 8) 136 struct ras_cmd_ctx { 137 uint32_t magic; 138 union { 139 struct { 140 uint16_t ras_cmd_minor_ver : 10; 141 uint16_t ras_cmd_major_ver : 6; 142 }; 143 uint16_t ras_cmd_ver; 144 }; 145 union { 146 struct { 147 uint16_t plat_major_ver : 10; 148 uint16_t plat_minor_ver : 6; 149 }; 150 uint16_t plat_ver; 151 }; 152 uint32_t cmd_id; 153 uint32_t cmd_res; 154 uint32_t input_size; 155 uint32_t output_size; 156 uint32_t output_buf_size; 157 uint32_t reserved[5]; 158 uint8_t input_buff_raw[RAS_CMD_MAX_IN_SIZE]; 159 uint8_t output_buff_raw[]; 160 }; 161 162 struct ras_cmd_dev_handle { 163 uint64_t dev_handle; 164 }; 165 166 struct ras_cmd_block_ecc_info_req { 167 struct ras_cmd_dev_handle dev; 168 uint32_t block_id; 169 uint32_t subblock_id; 170 uint32_t reserved[4]; 171 }; 172 173 struct ras_cmd_block_ecc_info_rsp { 174 uint32_t version; 175 uint32_t ce_count; 176 uint32_t ue_count; 177 uint32_t de_count; 178 uint32_t reserved[6]; 179 }; 180 181 struct ras_cmd_inject_error_req { 182 struct ras_cmd_dev_handle dev; 183 uint32_t block_id; 184 uint32_t subblock_id; 185 uint64_t address; 186 uint32_t error_type; 187 uint32_t instance_mask; 188 union { 189 struct { 190 /* vf index */ 191 uint64_t vf_idx : 6; 192 /* method of error injection. i.e persistent, coherent etc */ 193 uint64_t method : 10; 194 uint64_t rsv : 48; 195 }; 196 uint64_t value; 197 }; 198 uint32_t reserved[8]; 199 }; 200 201 struct ras_cmd_inject_error_rsp { 202 uint32_t version; 203 uint32_t reserved[5]; 204 uint64_t address; 205 }; 206 207 struct ras_cmd_dev_info { 208 uint64_t dev_handle; 209 uint32_t location_id; 210 uint32_t ecc_enabled; 211 uint32_t ecc_supported; 212 uint32_t vf_num; 213 uint32_t asic_type; 214 uint32_t oam_id; 215 uint32_t reserved[8]; 216 }; 217 218 struct ras_cmd_devices_info_rsp { 219 uint32_t version; 220 uint32_t dev_num; 221 uint32_t reserved[6]; 222 struct ras_cmd_dev_info devs[RAS_CMD_MAX_GPU_NUM]; 223 }; 224 225 struct ras_cmd_bad_page_record { 226 union { 227 uint64_t address; 228 uint64_t offset; 229 }; 230 uint64_t retired_page; 231 uint64_t ts; 232 233 uint32_t err_type; 234 235 union { 236 unsigned char bank; 237 unsigned char cu; 238 }; 239 240 unsigned char mem_channel; 241 unsigned char mcumc_id; 242 243 unsigned char valid; 244 unsigned char reserved[8]; 245 }; 246 247 struct ras_cmd_bad_pages_info_req { 248 struct ras_cmd_dev_handle device; 249 uint32_t group_index; 250 uint32_t reserved[5]; 251 }; 252 253 struct ras_cmd_bad_pages_info_rsp { 254 uint32_t version; 255 uint32_t group_index; 256 uint32_t bp_in_group; 257 uint32_t bp_total_cnt; 258 uint32_t reserved[4]; 259 struct ras_cmd_bad_page_record records[RAS_CMD_MAX_BAD_PAGES_PER_GROUP]; 260 }; 261 262 struct ras_query_interface_info_req { 263 uint32_t reserved[8]; 264 }; 265 266 struct ras_query_interface_info_rsp { 267 uint32_t version; 268 uint32_t ras_cmd_major_ver; 269 uint32_t ras_cmd_minor_ver; 270 uint32_t plat_major_ver; 271 uint32_t plat_minor_ver; 272 uint8_t interface_type; 273 uint8_t rsv[3]; 274 uint32_t reserved[8]; 275 }; 276 277 #define RAS_MAX_NUM_SAFE_RANGES 64 278 struct ras_cmd_ras_safe_fb_address_ranges_rsp { 279 uint32_t version; 280 uint32_t num_ranges; 281 uint32_t reserved[4]; 282 struct { 283 uint64_t start; 284 uint64_t size; 285 uint32_t idx; 286 uint32_t reserved[3]; 287 } range[RAS_MAX_NUM_SAFE_RANGES]; 288 }; 289 290 enum ras_fb_addr_type { 291 RAS_FB_ADDR_SOC_PHY, /* SPA */ 292 RAS_FB_ADDR_BANK, 293 RAS_FB_ADDR_VF_PHY, /* GPA */ 294 RAS_FB_ADDR_UNKNOWN 295 }; 296 297 struct ras_fb_bank_addr { 298 uint32_t stack_id; /* SID */ 299 uint32_t bank_group; 300 uint32_t bank; 301 uint32_t row; 302 uint32_t column; 303 uint32_t channel; 304 uint32_t subchannel; /* Also called Pseudochannel (PC) */ 305 uint32_t reserved[3]; 306 }; 307 308 struct ras_fb_vf_phy_addr { 309 uint32_t vf_idx; 310 uint32_t reserved; 311 uint64_t addr; 312 }; 313 314 union ras_translate_fb_address { 315 struct ras_fb_bank_addr bank_addr; 316 uint64_t soc_phy_addr; 317 struct ras_fb_vf_phy_addr vf_phy_addr; 318 }; 319 320 struct ras_cmd_translate_fb_address_req { 321 struct ras_cmd_dev_handle dev; 322 enum ras_fb_addr_type src_addr_type; 323 enum ras_fb_addr_type dest_addr_type; 324 union ras_translate_fb_address trans_addr; 325 }; 326 327 struct ras_cmd_translate_fb_address_rsp { 328 uint32_t version; 329 uint32_t reserved[5]; 330 union ras_translate_fb_address trans_addr; 331 }; 332 333 struct ras_dev_link_topology_req { 334 struct ras_cmd_dev_handle src; 335 struct ras_cmd_dev_handle dst; 336 }; 337 338 struct ras_dev_link_topology_rsp { 339 uint32_t version; 340 uint32_t link_status; /* HW status of the link */ 341 uint32_t link_type; /* type of the link */ 342 uint32_t num_hops; /* number of hops */ 343 uint32_t reserved[8]; 344 }; 345 346 struct ras_cmd_cper_snapshot_req { 347 struct ras_cmd_dev_handle dev; 348 }; 349 350 struct ras_cmd_cper_snapshot_rsp { 351 uint32_t version; 352 uint32_t reserved[4]; 353 uint32_t total_cper_num; 354 uint64_t start_cper_id; 355 uint64_t latest_cper_id; 356 }; 357 358 struct ras_cmd_cper_record_req { 359 struct ras_cmd_dev_handle dev; 360 uint64_t cper_start_id; 361 uint32_t cper_num; 362 uint32_t buf_size; 363 uint64_t buf_ptr; 364 uint32_t reserved[4]; 365 }; 366 367 struct ras_cmd_cper_record_rsp { 368 uint32_t version; 369 uint32_t real_data_size; 370 uint32_t real_cper_num; 371 uint32_t remain_num; 372 uint32_t reserved[4]; 373 }; 374 375 struct ras_cmd_batch_trace_snapshot_req { 376 struct ras_cmd_dev_handle dev; 377 }; 378 379 struct ras_cmd_batch_trace_snapshot_rsp { 380 uint32_t version; 381 uint32_t reserved[4]; 382 uint32_t total_batch_num; 383 uint64_t start_batch_id; 384 uint64_t latest_batch_id; 385 }; 386 387 struct ras_cmd_batch_trace_record_req { 388 struct ras_cmd_dev_handle dev; 389 uint64_t start_batch_id; 390 uint32_t batch_num; 391 uint32_t reserved[5]; 392 }; 393 394 struct batch_ras_trace_info { 395 uint64_t batch_id; 396 uint16_t offset; 397 uint8_t trace_num; 398 uint8_t rsv; 399 uint32_t reserved; 400 }; 401 402 #define RAS_CMD_MAX_BATCH_NUM 300 403 #define RAS_CMD_MAX_TRACE_NUM 300 404 struct ras_cmd_batch_trace_record_rsp { 405 uint32_t version; 406 uint16_t real_batch_num; 407 uint16_t remain_num; 408 uint64_t start_batch_id; 409 uint32_t reserved[2]; 410 struct batch_ras_trace_info batchs[RAS_CMD_MAX_BATCH_NUM]; 411 struct ras_log_info records[RAS_CMD_MAX_TRACE_NUM]; 412 }; 413 414 #pragma pack(pop) 415 416 int ras_cmd_init(struct ras_core_context *ras_core); 417 int ras_cmd_fini(struct ras_core_context *ras_core); 418 int rascore_handle_cmd(struct ras_core_context *ras_core, struct ras_cmd_ctx *cmd, void *data); 419 uint64_t ras_cmd_get_dev_handle(struct ras_core_context *ras_core); 420 int ras_cmd_query_interface_info(struct ras_core_context *ras_core, 421 struct ras_query_interface_info_rsp *rsp); 422 int ras_cmd_translate_soc_pa_to_bank(struct ras_core_context *ras_core, 423 uint64_t soc_pa, struct ras_fb_bank_addr *bank_addr); 424 int ras_cmd_translate_bank_to_soc_pa(struct ras_core_context *ras_core, 425 struct ras_fb_bank_addr bank_addr, uint64_t *soc_pa); 426 #endif 427