xref: /linux/drivers/gpu/drm/amd/ras/rascore/ras_cmd.h (revision 24f171c7e145f43b9f187578e89b0982ce87e54c)
1 /* SPDX-License-Identifier: MIT */
2 /*
3  * Copyright 2025 Advanced Micro Devices, Inc.
4  *
5  * Permission is hereby granted, free of charge, to any person obtaining a
6  * copy of this software and associated documentation files (the "Software"),
7  * to deal in the Software without restriction, including without limitation
8  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9  * and/or sell copies of the Software, and to permit persons to whom the
10  * Software is furnished to do so, subject to the following conditions:
11  *
12  * The above copyright notice and this permission notice shall be included in
13  * all copies or substantial portions of the Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
19  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
20  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
21  * OTHER DEALINGS IN THE SOFTWARE.
22  *
23  */
24 
25 #ifndef __RAS_CMD_H__
26 #define __RAS_CMD_H__
27 #include "ras.h"
28 #include "ras_eeprom.h"
29 #include "ras_log_ring.h"
30 #include "ras_cper.h"
31 
/*
 * Command-interface constants.
 *
 * RAS_CMD_DEV_HANDLE_MAGIC has no user inside this header; judging by its
 * name it tags device handles -- confirm against the implementation.
 */
#define RAS_CMD_DEV_HANDLE_MAGIC		0xFEEDAD00UL

/* Byte capacity of the input_buff_raw array in struct ras_cmd_ctx */
#define RAS_CMD_MAX_IN_SIZE			256
/* Entry count of the devs array in struct ras_cmd_devices_info_rsp */
#define RAS_CMD_MAX_GPU_NUM			32
/* Entry count of the records array in struct ras_cmd_bad_pages_info_rsp */
#define RAS_CMD_MAX_BAD_PAGES_PER_GROUP		32
37 
/*
 * Layout of sub_block_index in ta_ras_trigger_error_input: the sub block
 * occupies the low 12 bits and the instance value sits in the bits above
 * them.
 */
#define RAS_TA_INST_SHIFT	12
#define RAS_TA_INST_MASK	0xfffff000
43 
/*
 * Kind of command interface.  Values are spelled out so they cannot shift
 * if an entry is ever inserted.
 */
enum ras_cmd_interface_type {
	RAS_CMD_INTERFACE_TYPE_NONE   = 0,
	RAS_CMD_INTERFACE_TYPE_AMDGPU = 1,
	RAS_CMD_INTERFACE_TYPE_VF     = 2,
	RAS_CMD_INTERFACE_TYPE_PF     = 3,
};
50 
/*
 * Command id windows.  Each window is 0x10000 ids wide and begins exactly
 * where the previous one ends (every *_START equals the preceding *_END).
 */
enum ras_cmd_id_range {
	/* common commands */
	RAS_CMD_ID_COMMON_START   = 0x00000,
	RAS_CMD_ID_COMMON_END     = 0x10000,
	/* amdgpu window */
	RAS_CMD_ID_AMDGPU_START   = 0x10000,
	RAS_CMD_ID_AMDGPU_END     = 0x20000,
	/* mxgpu window */
	RAS_CMD_ID_MXGPU_START    = 0x20000,
	RAS_CMD_ID_MXGPU_END      = 0x30000,
	/* mxgpu VF window */
	RAS_CMD_ID_MXGPU_VF_START = 0x30000,
	RAS_CMD_ID_MXGPU_VF_END   = 0x40000,
};
61 
62 enum ras_cmd_id {
63 	RAS_CMD__BEGIN = RAS_CMD_ID_COMMON_START,
64 	RAS_CMD__QUERY_INTERFACE_INFO,
65 	RAS_CMD__GET_DEVICES_INFO,
66 	RAS_CMD__GET_BLOCK_ECC_STATUS,
67 	RAS_CMD__INJECT_ERROR,
68 	RAS_CMD__GET_BAD_PAGES,
69 	RAS_CMD__CLEAR_BAD_PAGE_INFO,
70 	RAS_CMD__RESET_ALL_ERROR_COUNTS,
71 	RAS_CMD__GET_SAFE_FB_ADDRESS_RANGES,
72 	RAS_CMD__TRANSLATE_FB_ADDRESS,
73 	RAS_CMD__GET_LINK_TOPOLOGY,
74 	RAS_CMD__GET_CPER_SNAPSHOT,
75 	RAS_CMD__GET_CPER_RECORD,
76 	RAS_CMD__GET_BATCH_TRACE_SNAPSHOT,
77 	RAS_CMD__GET_BATCH_TRACE_RECORD,
78 	RAS_CMD__SUPPORTED_MAX = RAS_CMD_ID_COMMON_END,
79 };
80 
/*
 * Result codes for a command (struct ras_cmd_ctx has a cmd_res field that
 * presumably carries one of these -- confirm in the dispatcher).
 *
 * Two of the original enumerator names are misspelled ("EXEED", "UKNOWN").
 * They are kept untouched for existing users; correctly spelled aliases
 * with the same value are provided for new code.  All values are pinned
 * explicitly so the numbering cannot drift.
 */
enum ras_cmd_response {
	RAS_CMD__SUCCESS                  = 0,
	RAS_CMD__SUCCESS_EXEED_BUFFER     = 1,
	RAS_CMD__SUCCESS_EXCEED_BUFFER    = RAS_CMD__SUCCESS_EXEED_BUFFER,
	RAS_CMD__ERROR_UKNOWN_CMD         = 2,
	RAS_CMD__ERROR_UNKNOWN_CMD        = RAS_CMD__ERROR_UKNOWN_CMD,
	RAS_CMD__ERROR_INVALID_CMD        = 3,
	RAS_CMD__ERROR_VERSION            = 4,
	RAS_CMD__ERROR_INVALID_INPUT_SIZE = 5,
	RAS_CMD__ERROR_INVALID_INPUT_DATA = 6,
	RAS_CMD__ERROR_DRV_INIT_FAIL      = 7,
	RAS_CMD__ERROR_ACCESS_DENIED      = 8,
	RAS_CMD__ERROR_GENERIC            = 9,
	RAS_CMD__ERROR_TIMEOUT            = 10,
};
94 
/*
 * RAS error categories.  Every non-zero value is a distinct single bit.
 * NOTE(review): whether callers actually OR these together is not visible
 * from this header.
 */
enum ras_error_type {
	RAS_TYPE_ERROR__NONE                = 0x0,
	RAS_TYPE_ERROR__PARITY              = 0x1,
	RAS_TYPE_ERROR__SINGLE_CORRECTABLE  = 0x2,
	RAS_TYPE_ERROR__MULTI_UNCORRECTABLE = 0x4,
	RAS_TYPE_ERROR__POISON              = 0x8,
};
102 
/*
 * Forward declarations: the declarations below only need pointers to these
 * types.  struct ras_cmd_ctx is defined further down in this header.
 */
struct ras_cmd_ctx;
struct ras_core_context;
105 
106 struct ras_cmd_mgr {
107 	struct list_head head;
108 	struct ras_core_context *ras_core;
109 	uint64_t dev_handle;
110 };
111 
/*
 * Pairs a command id with a handler function pointer.  The handler takes
 * the RAS core context, the command context and an opaque data pointer,
 * and returns an int.
 */
struct ras_cmd_func_map {
	uint32_t cmd_id;
	int (*func)(struct ras_core_context *ras_core,
		    struct ras_cmd_ctx *cmd,
		    void *data);
};
117 
/*
 * Domain/bus/device/function tuple packed into 32 bits.  The same storage
 * can be read field by field or as one word through u32_all.
 */
struct ras_device_bdf {
	union {
		struct {
			uint32_t function : 3;	/*  3 bits */
			uint32_t device   : 5;	/*  5 bits */
			uint32_t bus      : 8;	/*  8 bits */
			uint32_t domain   : 16;	/* 16 bits */
		};
		uint32_t u32_all;		/* all fields as a single word */
	};
};
129 
/*
 * Extra parameters accompanying a command.
 * NOTE(review): idx_vf is presumably a virtual-function index and data an
 * opaque caller payload -- confirm with the users of this struct.
 */
struct ras_cmd_param {
	uint32_t  idx_vf;
	void	 *data;
};
134 
135 #pragma pack(push, 8)
136 struct ras_cmd_ctx {
137 	uint32_t magic;
138 	union {
139 		struct {
140 			uint16_t ras_cmd_minor_ver : 10;
141 			uint16_t ras_cmd_major_ver : 6;
142 		};
143 		uint16_t ras_cmd_ver;
144 	};
145 	union {
146 		struct {
147 			uint16_t plat_major_ver : 10;
148 			uint16_t plat_minor_ver : 6;
149 		};
150 		uint16_t plat_ver;
151 	};
152 	uint32_t cmd_id;
153 	uint32_t cmd_res;
154 	uint32_t input_size;
155 	uint32_t output_size;
156 	uint32_t output_buf_size;
157 	uint32_t reserved[5];
158 	uint8_t  input_buff_raw[RAS_CMD_MAX_IN_SIZE];
159 	uint8_t  output_buff_raw[];
160 };
161 
/* Wrapper around a 64-bit device handle; embedded in the request structs. */
struct ras_cmd_dev_handle {
	uint64_t	dev_handle;
};
165 
166 struct ras_cmd_block_ecc_info_req {
167 	struct ras_cmd_dev_handle dev;
168 	uint32_t block_id;
169 	uint32_t subblock_id;
170 	uint32_t reserved[4];
171 };
172 
/*
 * Block ECC info response: a version word followed by three 32-bit
 * counters.
 * NOTE(review): ce/ue/de presumably stand for correctable, uncorrectable
 * and deferred errors -- confirm.
 */
struct ras_cmd_block_ecc_info_rsp {
	uint32_t	version;
	uint32_t	ce_count;
	uint32_t	ue_count;
	uint32_t	de_count;
	uint32_t	reserved[6];
};
180 
181 struct ras_cmd_inject_error_req {
182 	struct ras_cmd_dev_handle dev;
183 	uint32_t block_id;
184 	uint32_t subblock_id;
185 	uint64_t address;
186 	uint32_t error_type;
187 	uint32_t instance_mask;
188 	union {
189 		struct {
190 			/* vf index */
191 			uint64_t vf_idx : 6;
192 			/* method of error injection. i.e persistent, coherent etc */
193 			uint64_t method : 10;
194 			uint64_t rsv    : 48;
195 		};
196 		uint64_t value;
197 	};
198 	uint32_t reserved[8];
199 };
200 
/* Error injection response: version word, padding, and a 64-bit address. */
struct ras_cmd_inject_error_rsp {
	uint32_t	version;
	uint32_t	reserved[5];
	uint64_t	address;
};
206 
/*
 * Description of one device, used as the element type of the devs array
 * in struct ras_cmd_devices_info_rsp.
 */
struct ras_cmd_dev_info {
	uint64_t	dev_handle;
	uint32_t	location_id;
	uint32_t	ecc_enabled;
	uint32_t	ecc_supported;
	uint32_t	vf_num;
	uint32_t	asic_type;
	uint32_t	oam_id;
	uint32_t	reserved[8];
};
217 
218 struct ras_cmd_devices_info_rsp {
219 	uint32_t version;
220 	uint32_t dev_num;
221 	uint32_t reserved[6];
222 	struct ras_cmd_dev_info devs[RAS_CMD_MAX_GPU_NUM];
223 };
224 
/*
 * One bad page record.  address/offset share storage, and so do bank/cu.
 * NOTE(review): which alias is meaningful in which situation is not
 * visible from this header.
 */
struct ras_cmd_bad_page_record {
	union {
		uint64_t address;
		uint64_t offset;
	};
	uint64_t retired_page;
	uint64_t ts;		/* presumably a timestamp -- confirm */

	uint32_t err_type;

	union {
		uint8_t bank;
		uint8_t cu;
	};

	uint8_t  mem_channel;
	uint8_t  mcumc_id;

	uint8_t  valid;
	uint8_t  reserved[8];
};
246 
247 struct ras_cmd_bad_pages_info_req {
248 	struct ras_cmd_dev_handle device;
249 	uint32_t group_index;
250 	uint32_t reserved[5];
251 };
252 
253 struct ras_cmd_bad_pages_info_rsp {
254 	uint32_t version;
255 	uint32_t group_index;
256 	uint32_t bp_in_group;
257 	uint32_t bp_total_cnt;
258 	uint32_t reserved[4];
259 	struct ras_cmd_bad_page_record records[RAS_CMD_MAX_BAD_PAGES_PER_GROUP];
260 };
261 
/* Interface info query: no parameters, only 32 reserved bytes. */
struct ras_query_interface_info_req {
	uint32_t	reserved[8];
};
265 
/*
 * Interface info response: command and platform versions as full 32-bit
 * words, followed by a one-byte interface type.
 * NOTE(review): interface_type presumably holds an
 * enum ras_cmd_interface_type value -- confirm in the handler.
 */
struct ras_query_interface_info_rsp {
	uint32_t	version;
	uint32_t	ras_cmd_major_ver;
	uint32_t	ras_cmd_minor_ver;
	uint32_t	plat_major_ver;
	uint32_t	plat_minor_ver;
	uint8_t		interface_type;
	uint8_t		rsv[3];		/* pads interface_type to 4 bytes */
	uint32_t	reserved[8];
};
276 
/* Capacity of the range table in the safe FB address ranges response */
#define RAS_MAX_NUM_SAFE_RANGES		64

/*
 * Safe FB address ranges response: a fixed-capacity table of
 * (start, size, idx) entries.  num_ranges presumably gives the number of
 * populated entries -- confirm in the handler.
 */
struct ras_cmd_ras_safe_fb_address_ranges_rsp {
	uint32_t	version;
	uint32_t	num_ranges;
	uint32_t	reserved[4];
	struct {
		uint64_t	start;
		uint64_t	size;
		uint32_t	idx;
		uint32_t	reserved[3];
	} range[RAS_MAX_NUM_SAFE_RANGES];
};
289 
/* Address representations accepted by the FB address translation request. */
enum ras_fb_addr_type {
	RAS_FB_ADDR_SOC_PHY = 0,	/* SPA */
	RAS_FB_ADDR_BANK    = 1,
	RAS_FB_ADDR_VF_PHY  = 2,	/* GPA */
	RAS_FB_ADDR_UNKNOWN = 3
};
296 
/* Bank-level address: one 32-bit word per coordinate. */
struct ras_fb_bank_addr {
	uint32_t	stack_id;	/* SID */
	uint32_t	bank_group;
	uint32_t	bank;
	uint32_t	row;
	uint32_t	column;
	uint32_t	channel;
	uint32_t	subchannel;	/* a.k.a. pseudochannel (PC) */
	uint32_t	reserved[3];
};
307 
/* VF physical address: a VF index plus a 64-bit address. */
struct ras_fb_vf_phy_addr {
	uint32_t	vf_idx;
	uint32_t	reserved;	/* keeps addr at an 8-byte offset */
	uint64_t	addr;
};
313 
314 union ras_translate_fb_address {
315 	struct ras_fb_bank_addr bank_addr;
316 	uint64_t soc_phy_addr;
317 	struct ras_fb_vf_phy_addr vf_phy_addr;
318 };
319 
320 struct ras_cmd_translate_fb_address_req {
321 	struct ras_cmd_dev_handle dev;
322 	enum ras_fb_addr_type src_addr_type;
323 	enum ras_fb_addr_type dest_addr_type;
324 	union ras_translate_fb_address trans_addr;
325 };
326 
327 struct ras_cmd_translate_fb_address_rsp {
328 	uint32_t version;
329 	uint32_t reserved[5];
330 	union ras_translate_fb_address trans_addr;
331 };
332 
333 struct ras_dev_link_topology_req {
334 	struct ras_cmd_dev_handle src;
335 	struct ras_cmd_dev_handle dst;
336 };
337 
/* Link topology response. */
struct ras_dev_link_topology_rsp {
	uint32_t	version;
	uint32_t	link_status;	/* HW status of the link */
	uint32_t	link_type;	/* type of the link */
	uint32_t	num_hops;	/* number of hops */
	uint32_t	reserved[8];
};
345 
346 struct ras_cmd_cper_snapshot_req {
347 	struct ras_cmd_dev_handle dev;
348 };
349 
/*
 * CPER snapshot response: a record count and two 64-bit record ids.
 * NOTE(review): start/latest presumably bound the range of ids currently
 * available -- confirm in the handler.
 */
struct ras_cmd_cper_snapshot_rsp {
	uint32_t	version;
	uint32_t	reserved[4];
	uint32_t	total_cper_num;
	uint64_t	start_cper_id;
	uint64_t	latest_cper_id;
};
357 
358 struct ras_cmd_cper_record_req {
359 	struct ras_cmd_dev_handle dev;
360 	uint64_t cper_start_id;
361 	uint32_t cper_num;
362 	uint32_t buf_size;
363 	uint64_t buf_ptr;
364 	uint32_t reserved[4];
365 };
366 
/*
 * CPER record response.
 * NOTE(review): by name, real_data_size / real_cper_num report what was
 * actually delivered and remain_num what is still pending -- confirm.
 */
struct ras_cmd_cper_record_rsp {
	uint32_t	version;
	uint32_t	real_data_size;
	uint32_t	real_cper_num;
	uint32_t	remain_num;
	uint32_t	reserved[4];
};
374 
375 struct ras_cmd_batch_trace_snapshot_req {
376 	struct ras_cmd_dev_handle dev;
377 };
378 
/*
 * Batch trace snapshot response: a batch count and two 64-bit batch ids.
 * NOTE(review): start/latest presumably bound the range of ids currently
 * available -- confirm in the handler.
 */
struct ras_cmd_batch_trace_snapshot_rsp {
	uint32_t	version;
	uint32_t	reserved[4];
	uint32_t	total_batch_num;
	uint64_t	start_batch_id;
	uint64_t	latest_batch_id;
};
386 
387 struct ras_cmd_batch_trace_record_req {
388 	struct ras_cmd_dev_handle dev;
389 	uint64_t start_batch_id;
390 	uint32_t batch_num;
391 	uint32_t reserved[5];
392 };
393 
/*
 * Descriptor of one batch, used as the element type of the batchs array
 * in struct ras_cmd_batch_trace_record_rsp.
 * NOTE(review): offset / trace_num presumably locate this batch's entries
 * inside the records array of that response -- confirm in the handler.
 */
struct batch_ras_trace_info {
	uint64_t	batch_id;
	uint16_t	offset;
	uint8_t		trace_num;
	uint8_t		rsv;
	uint32_t	reserved;
};
401 
402 #define RAS_CMD_MAX_BATCH_NUM  300
403 #define RAS_CMD_MAX_TRACE_NUM  300
404 struct ras_cmd_batch_trace_record_rsp {
405 	uint32_t version;
406 	uint16_t real_batch_num;
407 	uint16_t remain_num;
408 	uint64_t start_batch_id;
409 	uint32_t reserved[2];
410 	struct batch_ras_trace_info batchs[RAS_CMD_MAX_BATCH_NUM];
411 	struct ras_log_info records[RAS_CMD_MAX_TRACE_NUM];
412 };
413 
414 #pragma pack(pop)
415 
/*
 * Command layer entry points.  The implementations live outside this
 * header; the meaning of the int return values is not visible here.
 */

/* Set-up / tear-down for a RAS core context */
int ras_cmd_init(struct ras_core_context *ras_core);
int ras_cmd_fini(struct ras_core_context *ras_core);

/* Handle one command described by @cmd; @data is an opaque pointer */
int rascore_handle_cmd(struct ras_core_context *ras_core,
		       struct ras_cmd_ctx *cmd, void *data);

/* Return the 64-bit device handle for @ras_core */
uint64_t ras_cmd_get_dev_handle(struct ras_core_context *ras_core);

/* Fill @rsp with the interface information */
int ras_cmd_query_interface_info(struct ras_core_context *ras_core,
				 struct ras_query_interface_info_rsp *rsp);

/*
 * Address translation helpers; the result is written through the last
 * (pointer) parameter.
 * NOTE(review): ras_cmd_translate_bank_to_soc_pa() takes its
 * struct ras_fb_bank_addr argument by value, unlike its sibling.
 */
int ras_cmd_translate_soc_pa_to_bank(struct ras_core_context *ras_core,
				     uint64_t soc_pa,
				     struct ras_fb_bank_addr *bank_addr);
int ras_cmd_translate_bank_to_soc_pa(struct ras_core_context *ras_core,
				     struct ras_fb_bank_addr bank_addr,
				     uint64_t *soc_pa);
426 #endif
427