// SPDX-License-Identifier: MIT
/*
 * Copyright 2025 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 */

#include <linux/pci.h>
#include "amdgpu.h"
#include "amdgpu_ras.h"
#include "ras_sys.h"
#include "amdgpu_ras_cmd.h"
#include "amdgpu_virt_ras_cmd.h"
#include "amdgpu_ras_mgr.h"

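/*
 * Forward a RAS command to the host RAS service. The command context is
 * staged in a guest VRAM buffer; the buffer's offset into VRAM (which the
 * host can translate) and its length are handed to the host, which is
 * expected to execute the command in place and write its result code and
 * output payload back into the same buffer.
 */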
static int amdgpu_virt_ras_remote_ioctl_cmd(struct ras_core_context *ras_core,
		struct ras_cmd_ctx *cmd, void *output_data, uint32_t output_size)
{
	struct amdgpu_device *adev = (struct amdgpu_device *)ras_core->dev;
	uint32_t mem_len = ALIGN(sizeof(*cmd) + output_size, AMDGPU_GPU_PAGE_SIZE);
	struct ras_cmd_ctx *rcmd;
	struct amdgpu_bo *rcmd_bo = NULL;
	uint64_t mc_addr = 0;
	void *cpu_addr = NULL;
	int ret = 0;

	ret = amdgpu_bo_create_kernel(adev, mem_len, PAGE_SIZE,
			AMDGPU_GEM_DOMAIN_VRAM, &rcmd_bo, &mc_addr, (void **)&cpu_addr);
	if (ret)
		return ret;

	rcmd = (struct ras_cmd_ctx *)cpu_addr;
	memset(rcmd, 0, mem_len);
	memcpy(rcmd, cmd, sizeof(*cmd));

	ret = amdgpu_virt_send_remote_ras_cmd(ras_core->dev,
			mc_addr - adev->gmc.vram_start, mem_len);
	if (!ret) {
		if (rcmd->cmd_res) {
			ret = rcmd->cmd_res;
			goto out;
		}

		cmd->cmd_res = rcmd->cmd_res;
		cmd->output_size = rcmd->output_size;
		if (rcmd->output_size && (rcmd->output_size <= output_size) && output_data)
			memcpy(output_data, rcmd->output_buff_raw, rcmd->output_size);
	}

out:
	amdgpu_bo_free_kernel(&rcmd_bo, &mc_addr, &cpu_addr);

	return ret;
}

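/*
 * Build a ras_cmd_ctx for @cmd_id on the stack, copy in the input
 * payload, issue the remote ioctl and check that the host returned
 * exactly @output_size bytes of output.
 */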
static int amdgpu_virt_ras_send_remote_cmd(struct ras_core_context *ras_core,
		uint32_t cmd_id, void *input_data, uint32_t input_size,
		void *output_data, uint32_t output_size)
{
	struct ras_cmd_ctx rcmd = {0};
	int ret;

	rcmd.cmd_id = cmd_id;
	rcmd.input_size = input_size;
	memcpy(rcmd.input_buff_raw, input_data, input_size);

	ret = amdgpu_virt_ras_remote_ioctl_cmd(ras_core,
			&rcmd, output_data, output_size);
	if (!ret) {
		if (rcmd.output_size != output_size)
			return RAS_CMD__ERROR_GENERIC;
	}

	return ret;
}

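/*
 * Ask the host for an overview of its batch trace log: the first and
 * latest batch IDs and the total number of logged batches.
 */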
static int amdgpu_virt_ras_get_batch_trace_overview(struct ras_core_context *ras_core,
		struct ras_log_batch_overview *overview)
{
	struct ras_cmd_batch_trace_snapshot_req req = {0};
	struct ras_cmd_batch_trace_snapshot_rsp rsp = {0};
	int ret;

	ret = amdgpu_virt_ras_send_remote_cmd(ras_core, RAS_CMD__GET_BATCH_TRACE_SNAPSHOT,
			&req, sizeof(req), &rsp, sizeof(rsp));
	if (ret)
		return ret;

	overview->first_batch_id = rsp.start_batch_id;
	overview->last_batch_id = rsp.latest_batch_id;
	overview->logged_batch_count = rsp.total_batch_num;

	return RAS_CMD__SUCCESS;
}

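/*
 * Handler for RAS_CMD__GET_CPER_SNAPSHOT. The request is forwarded to
 * the host; on success the cached batch manager state is cleared and
 * re-seeded with a fresh batch trace overview so that later record
 * queries start from a consistent view.
 */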
static int amdgpu_virt_ras_get_cper_snapshot(struct ras_core_context *ras_core,
		struct ras_cmd_ctx *cmd, void *data)
{
	struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(ras_core->dev);
	struct amdgpu_virt_ras_cmd *virt_ras =
		(struct amdgpu_virt_ras_cmd *)ras_mgr->virt_ras_cmd;
	int ret;

	if (cmd->input_size != sizeof(struct ras_cmd_cper_snapshot_req))
		return RAS_CMD__ERROR_INVALID_INPUT_SIZE;

	ret = amdgpu_virt_ras_send_remote_cmd(ras_core, cmd->cmd_id,
			cmd->input_buff_raw, cmd->input_size,
			cmd->output_buff_raw, sizeof(struct ras_cmd_cper_snapshot_rsp));
	if (ret)
		return ret;

	memset(&virt_ras->batch_mgr, 0, sizeof(virt_ras->batch_mgr));
	amdgpu_virt_ras_get_batch_trace_overview(ras_core,
			&virt_ras->batch_mgr.batch_overview);

	cmd->output_size = sizeof(struct ras_cmd_cper_snapshot_rsp);
	return RAS_CMD__SUCCESS;
}

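/*
 * Collect the trace records belonging to @batch_id. The cached response
 * in @rsp_cache is refilled from the host whenever @batch_id falls
 * outside the cached batch window. Returns the number of record
 * pointers stored in @trace_arr (at most @arr_num entries), -EPIPE if
 * the refill failed, or -ENODATA if the batch is not present.
 */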
static int amdgpu_virt_ras_get_batch_records(struct ras_core_context *ras_core, uint64_t batch_id,
		struct ras_log_info **trace_arr, uint32_t arr_num,
		struct ras_cmd_batch_trace_record_rsp *rsp_cache)
{
	struct ras_cmd_batch_trace_record_req req = {
		.start_batch_id = batch_id,
		.batch_num = RAS_CMD_MAX_BATCH_NUM,
	};
	struct ras_cmd_batch_trace_record_rsp *rsp = rsp_cache;
	struct batch_ras_trace_info *batch;
	int ret = 0;
	uint8_t i;

	if (!rsp->real_batch_num || (batch_id < rsp->start_batch_id) ||
	    (batch_id >= (rsp->start_batch_id + rsp->real_batch_num))) {

		memset(rsp, 0, sizeof(*rsp));
		ret = amdgpu_virt_ras_send_remote_cmd(ras_core, RAS_CMD__GET_BATCH_TRACE_RECORD,
				&req, sizeof(req), rsp, sizeof(*rsp));
		if (ret)
			return -EPIPE;
	}

	batch = &rsp->batchs[batch_id - rsp->start_batch_id];
	if (batch_id != batch->batch_id)
		return -ENODATA;

	for (i = 0; i < batch->trace_num; i++) {
		if (i >= arr_num)
			break;
		trace_arr[i] = &rsp->records[batch->offset + i];
	}

	return i;
}

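/*
 * Handler for RAS_CMD__GET_CPER_RECORD. Walks the requested range of
 * batch IDs, fetches the trace records for each batch and encodes them
 * as CPER data into a kernel bounce buffer, which is finally copied to
 * the user buffer described by the request. A -ENOMEM from the CPER
 * encoder is treated as "output buffer full": the walk stops and the
 * number of batches left over is reported back in remain_num.
 */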
static int amdgpu_virt_ras_get_cper_records(struct ras_core_context *ras_core,
		struct ras_cmd_ctx *cmd, void *data)
{
	struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(ras_core->dev);
	struct amdgpu_virt_ras_cmd *virt_ras =
		(struct amdgpu_virt_ras_cmd *)ras_mgr->virt_ras_cmd;
	struct ras_cmd_cper_record_req *req =
		(struct ras_cmd_cper_record_req *)cmd->input_buff_raw;
	struct ras_cmd_cper_record_rsp *rsp =
		(struct ras_cmd_cper_record_rsp *)cmd->output_buff_raw;
	struct ras_log_batch_overview *overview = &virt_ras->batch_mgr.batch_overview;
	struct ras_cmd_batch_trace_record_rsp *rsp_cache = &virt_ras->batch_mgr.batch_trace;
	struct ras_log_info **trace;
	uint32_t offset = 0, real_data_len = 0;
	uint64_t batch_id;
	uint8_t *out_buf;
	int ret = 0, i, count;

	if (cmd->input_size != sizeof(struct ras_cmd_cper_record_req))
		return RAS_CMD__ERROR_INVALID_INPUT_SIZE;

	if (!req->buf_size || !req->buf_ptr || !req->cper_num)
		return RAS_CMD__ERROR_INVALID_INPUT_DATA;

	trace = kzalloc_objs(*trace, MAX_RECORD_PER_BATCH);
	if (!trace)
		return RAS_CMD__ERROR_GENERIC;

	out_buf = kzalloc(req->buf_size, GFP_KERNEL);
	if (!out_buf) {
		kfree(trace);
		return RAS_CMD__ERROR_GENERIC;
	}

	for (i = 0; i < req->cper_num; i++) {
		batch_id = req->cper_start_id + i;
		if (batch_id >= overview->last_batch_id)
			break;
		count = amdgpu_virt_ras_get_batch_records(ras_core, batch_id,
				trace, MAX_RECORD_PER_BATCH,
				rsp_cache);
		if (count > 0) {
			ret = ras_cper_generate_cper(ras_core, trace, count,
					&out_buf[offset], req->buf_size - offset, &real_data_len);
			if (ret)
				break;

			offset += real_data_len;
		}
	}

	if ((ret && (ret != -ENOMEM)) ||
	    copy_to_user(u64_to_user_ptr(req->buf_ptr), out_buf, offset)) {
		kfree(out_buf);
		kfree(trace);
		return RAS_CMD__ERROR_GENERIC;
	}

	rsp->real_data_size = offset;
	rsp->real_cper_num = i;
	rsp->remain_num = (ret == -ENOMEM) ? (req->cper_num - i) : 0;
	rsp->version = 0;

	cmd->output_size = sizeof(struct ras_cmd_cper_record_rsp);

	kfree(out_buf);
	kfree(trace);

	return RAS_CMD__SUCCESS;
}

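/*
 * Prime the persistent GET_ALL_BLOCK_ECC_STATUS command context that
 * lives in the pre-allocated VRAM buffer used for host auto-updates.
 */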
static int __fill_get_blocks_ecc_cmd(struct amdgpu_device *adev,
		struct vram_blocks_ecc *blks_ecc)
{
	struct ras_cmd_ctx *rcmd;

	if (!blks_ecc || !blks_ecc->bo || !blks_ecc->cpu_addr)
		return -EINVAL;

	rcmd = (struct ras_cmd_ctx *)blks_ecc->cpu_addr;

	rcmd->cmd_id = RAS_CMD__GET_ALL_BLOCK_ECC_STATUS;
	rcmd->input_size = sizeof(struct ras_cmd_blocks_ecc_req);
	rcmd->output_buf_size = blks_ecc->size - sizeof(*rcmd);

	return 0;
}

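/*
 * Register (@reg == true) or unregister (@reg == false) a command for
 * automatic updates by the host: while registered, the host is expected
 * to keep refreshing the @len-byte result area at VRAM offset @gpa_addr.
 */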
static int __set_cmd_auto_update(struct amdgpu_device *adev,
		enum ras_cmd_id cmd_id, uint64_t gpa_addr, uint32_t len, bool reg)
{
	struct ras_cmd_auto_update_req req = {0};
	struct ras_cmd_auto_update_rsp rsp = {0};
	int ret;

	req.mode = reg ? 1 : 0;
	req.cmd_id = cmd_id;
	req.addr = gpa_addr;
	req.len = len;
	ret = amdgpu_ras_mgr_handle_ras_cmd(adev, RAS_CMD__SET_CMD_AUTO_UPDATE,
			&req, sizeof(req), &rsp, sizeof(rsp));

	return ret;
}

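/*
 * Handler for RAS_CMD__GET_BLOCK_ECC_STATUS. On first use the shared
 * VRAM buffer is registered for host auto-update of the per-block ECC
 * status; subsequent queries read the requested block's error counts
 * straight from that buffer without a round trip to the host.
 */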
static int amdgpu_virt_ras_get_block_ecc(struct ras_core_context *ras_core,
		struct ras_cmd_ctx *cmd, void *data)
{
	struct amdgpu_device *adev = ras_core->dev;
	struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
	struct amdgpu_virt_ras_cmd *virt_ras =
		(struct amdgpu_virt_ras_cmd *)ras_mgr->virt_ras_cmd;
	struct vram_blocks_ecc *blks_ecc = &virt_ras->blocks_ecc;
	struct ras_cmd_ctx *blks_ecc_cmd_ctx;
	struct ras_cmd_blocks_ecc_rsp *blks_ecc_rsp;
	struct ras_cmd_block_ecc_info_req *input_data =
		(struct ras_cmd_block_ecc_info_req *)cmd->input_buff_raw;
	struct ras_cmd_block_ecc_info_rsp *output_data =
		(struct ras_cmd_block_ecc_info_rsp *)cmd->output_buff_raw;
	int ret = 0;

	if (cmd->input_size != sizeof(struct ras_cmd_block_ecc_info_req))
		return RAS_CMD__ERROR_INVALID_INPUT_SIZE;

	if (input_data->block_id >= MAX_RAS_BLOCK_NUM)
		return RAS_CMD__ERROR_INVALID_INPUT_DATA;

	if (__fill_get_blocks_ecc_cmd(adev, blks_ecc))
		return RAS_CMD__ERROR_GENERIC;

	if (!virt_ras->blocks_ecc.auto_update_actived) {
		ret = __set_cmd_auto_update(adev, RAS_CMD__GET_ALL_BLOCK_ECC_STATUS,
				blks_ecc->mc_addr - adev->gmc.vram_start,
				blks_ecc->size, true);
		if (ret)
			return ret;

		blks_ecc->auto_update_actived = true;
	}

	blks_ecc_cmd_ctx = blks_ecc->cpu_addr;
	blks_ecc_rsp = (struct ras_cmd_blocks_ecc_rsp *)blks_ecc_cmd_ctx->output_buff_raw;

	output_data->ce_count = blks_ecc_rsp->blocks[input_data->block_id].ce_count;
	output_data->ue_count = blks_ecc_rsp->blocks[input_data->block_id].ue_count;
	output_data->de_count = blks_ecc_rsp->blocks[input_data->block_id].de_count;

	cmd->output_size = sizeof(struct ras_cmd_block_ecc_info_rsp);
	return RAS_CMD__SUCCESS;
}

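/* Commands with a guest-local handler; all others are forwarded to the host. */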
static struct ras_cmd_func_map amdgpu_virt_ras_cmd_maps[] = {
	{RAS_CMD__GET_CPER_SNAPSHOT, amdgpu_virt_ras_get_cper_snapshot},
	{RAS_CMD__GET_CPER_RECORD, amdgpu_virt_ras_get_cper_records},
	{RAS_CMD__GET_BLOCK_ECC_STATUS, amdgpu_virt_ras_get_block_ecc},
};

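/*
 * Dispatch a RAS command on a virtual function: commands listed in
 * amdgpu_virt_ras_cmd_maps are handled in the guest, everything else is
 * passed through to the host unchanged. The handler result is recorded
 * in cmd_res, and output that overruns the caller's buffer is reported
 * via RAS_CMD__SUCCESS_EXEED_BUFFER.
 */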
int amdgpu_virt_ras_handle_cmd(struct ras_core_context *ras_core,
		struct ras_cmd_ctx *cmd)
{
	struct ras_cmd_func_map *ras_cmd = NULL;
	int i, res;

	for (i = 0; i < ARRAY_SIZE(amdgpu_virt_ras_cmd_maps); i++) {
		if (cmd->cmd_id == amdgpu_virt_ras_cmd_maps[i].cmd_id) {
			ras_cmd = &amdgpu_virt_ras_cmd_maps[i];
			break;
		}
	}

	if (ras_cmd)
		res = ras_cmd->func(ras_core, cmd, NULL);
	else
		res = amdgpu_virt_ras_remote_ioctl_cmd(ras_core, cmd,
				cmd->output_buff_raw, cmd->output_buf_size);

	cmd->cmd_res = res;

	if (cmd->output_size > cmd->output_buf_size) {
		RAS_DEV_ERR(ras_core->dev,
			"Output data size 0x%x exceeds buffer size 0x%x!\n",
			cmd->output_size, cmd->output_buf_size);
		return RAS_CMD__SUCCESS_EXEED_BUFFER;
	}

	return RAS_CMD__SUCCESS;
}

int amdgpu_virt_ras_sw_init(struct amdgpu_device *adev)
{
	struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);

	ras_mgr->virt_ras_cmd = kzalloc_obj(struct amdgpu_virt_ras_cmd);
	if (!ras_mgr->virt_ras_cmd)
		return -ENOMEM;

	return 0;
}

int amdgpu_virt_ras_sw_fini(struct amdgpu_device *adev)
{
	struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);

	kfree(ras_mgr->virt_ras_cmd);
	ras_mgr->virt_ras_cmd = NULL;

	return 0;
}

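/*
 * Allocate the page-sized VRAM buffer that the host auto-update
 * mechanism fills with the per-block ECC status.
 */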
int amdgpu_virt_ras_hw_init(struct amdgpu_device *adev)
{
	struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
	struct amdgpu_virt_ras_cmd *virt_ras =
		(struct amdgpu_virt_ras_cmd *)ras_mgr->virt_ras_cmd;
	struct vram_blocks_ecc *blks_ecc = &virt_ras->blocks_ecc;

	amdgpu_virt_get_ras_capability(adev);

	memset(blks_ecc, 0, sizeof(*blks_ecc));
	blks_ecc->size = PAGE_SIZE;
	if (amdgpu_bo_create_kernel(adev, blks_ecc->size,
			PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM,
			&blks_ecc->bo, &blks_ecc->mc_addr,
			(void **)&blks_ecc->cpu_addr))
		return -ENOMEM;

	return 0;
}

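/*
 * Unregister the ECC auto-update with the host and release the shared
 * VRAM buffer.
 */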
int amdgpu_virt_ras_hw_fini(struct amdgpu_device *adev)
{
	struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
	struct amdgpu_virt_ras_cmd *virt_ras =
		(struct amdgpu_virt_ras_cmd *)ras_mgr->virt_ras_cmd;
	struct vram_blocks_ecc *blks_ecc = &virt_ras->blocks_ecc;

	if (blks_ecc->bo) {
		__set_cmd_auto_update(adev,
				RAS_CMD__GET_ALL_BLOCK_ECC_STATUS,
				blks_ecc->mc_addr - adev->gmc.vram_start,
				blks_ecc->size, false);

		memset(blks_ecc->cpu_addr, 0, blks_ecc->size);
		amdgpu_bo_free_kernel(&blks_ecc->bo,
				&blks_ecc->mc_addr, &blks_ecc->cpu_addr);

		memset(blks_ecc, 0, sizeof(*blks_ecc));
	}

	return 0;
}

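/*
 * The host-side auto-update registration is presumably invalidated by a
 * reset, so drop the cached flag and force re-registration on the next
 * block ECC query.
 */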
int amdgpu_virt_ras_pre_reset(struct amdgpu_device *adev)
{
	struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
	struct amdgpu_virt_ras_cmd *virt_ras =
		(struct amdgpu_virt_ras_cmd *)ras_mgr->virt_ras_cmd;

	virt_ras->blocks_ecc.auto_update_actived = false;
	return 0;
}

int amdgpu_virt_ras_post_reset(struct amdgpu_device *adev)
{
	return 0;
}

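/* Record whether the host advertises remote unified-RAS command support. */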
void amdgpu_virt_ras_set_remote_uniras(struct amdgpu_device *adev, bool en)
{
	struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
	struct amdgpu_virt_ras_cmd *virt_ras;

	if (!ras_mgr || !ras_mgr->virt_ras_cmd)
		return;

	virt_ras = (struct amdgpu_virt_ras_cmd *)ras_mgr->virt_ras_cmd;
	virt_ras->remote_uniras_supported = en;
}

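/*
 * Remote unified-RAS commands are usable only when the host has
 * advertised support and the device is not in the middle of a reset.
 */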
bool amdgpu_virt_ras_remote_uniras_enabled(struct amdgpu_device *adev)
{
	struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
	struct amdgpu_virt_ras_cmd *virt_ras;

	if (amdgpu_in_reset(adev))
		return false;

	if (!ras_mgr || !ras_mgr->virt_ras_cmd)
		return false;

	virt_ras = (struct amdgpu_virt_ras_cmd *)ras_mgr->virt_ras_cmd;

	return virt_ras->remote_uniras_supported;
}