// SPDX-License-Identifier: MIT
/*
 * Copyright 2025 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 */

#include <linux/pci.h>
#include "amdgpu.h"
#include "amdgpu_ras.h"
#include "ras_sys.h"
#include "amdgpu_ras_cmd.h"
#include "amdgpu_virt_ras_cmd.h"
#include "amdgpu_ras_mgr.h"

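/*
 * Resolve the shared-memory slot used to exchange a RAS command with the
 * host. The SR-IOV RAS telemetry area must fall inside either the FW or the
 * driver reserved VRAM region; from that, derive both the CPU virtual
 * address of the slot and the guest physical address handed to the host.
 * RAS_CMD__GET_ALL_BLOCK_ECC_STATUS uses the dedicated blocks-ecc buffer;
 * all other commands share the generic command buffer.
 */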
static int amdgpu_virt_ras_get_cmd_shared_mem(struct ras_core_context *ras_core,
		uint32_t cmd, uint32_t mem_size, struct amdgpu_virt_shared_mem *shared_mem)
{
	struct amdgpu_device *adev = ras_core->dev;
	struct amdsriov_ras_telemetry *ras_telemetry_cpu;
	struct amdsriov_ras_telemetry *ras_telemetry_gpu;
	void *fw_va = adev->mman.resv_region[AMDGPU_RESV_FW_VRAM_USAGE].cpu_ptr;
	void *drv_va = adev->mman.resv_region[AMDGPU_RESV_DRV_VRAM_USAGE].cpu_ptr;
	uint64_t fw_vram_usage_start_offset = 0;
	uint64_t ras_telemetry_offset = 0;

	if (!adev->virt.fw_reserve.ras_telemetry)
		return -EINVAL;

	if (fw_va && fw_va <= adev->virt.fw_reserve.ras_telemetry) {
		fw_vram_usage_start_offset = adev->mman.resv_region[AMDGPU_RESV_FW_VRAM_USAGE].offset;
		ras_telemetry_offset = (uintptr_t)adev->virt.fw_reserve.ras_telemetry -
				       (uintptr_t)fw_va;
	} else if (drv_va && drv_va <= adev->virt.fw_reserve.ras_telemetry) {
		fw_vram_usage_start_offset = adev->mman.resv_region[AMDGPU_RESV_DRV_VRAM_USAGE].offset;
		ras_telemetry_offset = (uintptr_t)adev->virt.fw_reserve.ras_telemetry -
				       (uintptr_t)drv_va;
	} else {
		return -EINVAL;
	}

	ras_telemetry_cpu =
		(struct amdsriov_ras_telemetry *)adev->virt.fw_reserve.ras_telemetry;
	ras_telemetry_gpu =
		(struct amdsriov_ras_telemetry *)(uintptr_t)(fw_vram_usage_start_offset +
							     ras_telemetry_offset);

	if (cmd == RAS_CMD__GET_ALL_BLOCK_ECC_STATUS) {
		if (mem_size > AMD_SRIOV_UNIRAS_BLOCKS_BUF_SIZE)
			return -ENOMEM;

		shared_mem->cpu_addr = ras_telemetry_cpu->uniras_shared_mem.blocks_ecc_buf;
		shared_mem->gpa =
			(uintptr_t)ras_telemetry_gpu->uniras_shared_mem.blocks_ecc_buf -
			adev->gmc.vram_start;
		shared_mem->size = mem_size;
	} else {
		if (mem_size > AMD_SRIOV_UNIRAS_CMD_MAX_SIZE)
			return -ENOMEM;

		shared_mem->cpu_addr = ras_telemetry_cpu->uniras_shared_mem.cmd_buf;
		shared_mem->gpa =
			(uintptr_t)ras_telemetry_gpu->uniras_shared_mem.cmd_buf -
			adev->gmc.vram_start;
		shared_mem->size = mem_size;
	}

	return 0;
}

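/*
 * Issue one RAS command remotely: stage the command context in shared
 * memory, hand its GPA to the host via amdgpu_virt_send_remote_ras_cmd(),
 * then copy the result code and any output payload back to the caller.
 * remote_access_lock serializes all users of the shared buffer.
 */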
static int amdgpu_virt_ras_remote_ioctl_cmd(struct ras_core_context *ras_core,
		struct ras_cmd_ctx *cmd, void *output_data, uint32_t output_size)
{
	struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(ras_core->dev);
	struct amdgpu_virt_ras_cmd *virt_ras = ras_mgr->virt_ras_cmd;
	uint32_t mem_len = ALIGN(sizeof(*cmd) + output_size, AMDGPU_GPU_PAGE_SIZE);
	struct ras_cmd_ctx *rcmd;
	struct amdgpu_virt_shared_mem shared_mem = {0};
	int ret = 0;

	mutex_lock(&virt_ras->remote_access_lock);

	ret = amdgpu_virt_ras_get_cmd_shared_mem(ras_core, cmd->cmd_id, mem_len, &shared_mem);
	if (ret)
		goto out;

	rcmd = (struct ras_cmd_ctx *)shared_mem.cpu_addr;
	memset(rcmd, 0, mem_len);
	memcpy(rcmd, cmd, sizeof(*cmd));

	ret = amdgpu_virt_send_remote_ras_cmd(ras_core->dev,
					      shared_mem.gpa, mem_len);
	if (!ret) {
		if (rcmd->cmd_res) {
			ret = rcmd->cmd_res;
			goto out;
		}

		cmd->cmd_res = rcmd->cmd_res;
		cmd->output_size = rcmd->output_size;
		if (rcmd->output_size && (rcmd->output_size <= output_size) && output_data)
			memcpy(output_data, rcmd->output_buff_raw, rcmd->output_size);
	}

out:
	mutex_unlock(&virt_ras->remote_access_lock);
	return ret;
}

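/*
 * Wrapper that builds a ras_cmd_ctx around raw input/output buffers and
 * issues it remotely. A response whose size differs from what the caller
 * expects is treated as a generic error.
 */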
static int amdgpu_virt_ras_send_remote_cmd(struct ras_core_context *ras_core,
		uint32_t cmd_id, void *input_data, uint32_t input_size,
		void *output_data, uint32_t output_size)
{
	struct ras_cmd_ctx rcmd = {0};
	int ret;

	if (input_size > RAS_CMD_MAX_IN_SIZE)
		return RAS_CMD__ERROR_INVALID_INPUT_SIZE;

	rcmd.cmd_id = cmd_id;
	rcmd.input_size = input_size;
	memcpy(rcmd.input_buff_raw, input_data, input_size);

	ret = amdgpu_virt_ras_remote_ioctl_cmd(ras_core,
					       &rcmd, output_data, output_size);
	if (!ret) {
		if (rcmd.output_size != output_size)
			return RAS_CMD__ERROR_GENERIC;
	}

	return ret;
}

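/*
 * Ask the host which batch trace IDs it currently holds (first/last batch
 * ID and the number of logged batches).
 */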
static int amdgpu_virt_ras_get_batch_trace_overview(struct ras_core_context *ras_core,
		struct ras_log_batch_overview *overview)
{
	struct ras_cmd_batch_trace_snapshot_req req = {0};
	struct ras_cmd_batch_trace_snapshot_rsp rsp = {0};
	int ret;

	ret = amdgpu_virt_ras_send_remote_cmd(ras_core, RAS_CMD__GET_BATCH_TRACE_SNAPSHOT,
					      &req, sizeof(req), &rsp, sizeof(rsp));
	if (ret)
		return ret;

	overview->first_batch_id = rsp.start_batch_id;
	overview->last_batch_id = rsp.latest_batch_id;
	overview->logged_batch_count = rsp.total_batch_num;

	return RAS_CMD__SUCCESS;
}

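/*
 * Forward a CPER snapshot request to the host, then drop the cached batch
 * records and refresh the batch-trace overview so follow-up record queries
 * start from a clean state.
 */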
static int amdgpu_virt_ras_get_cper_snapshot(struct ras_core_context *ras_core,
		struct ras_cmd_ctx *cmd, void *data)
{
	struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(ras_core->dev);
	struct amdgpu_virt_ras_cmd *virt_ras =
		(struct amdgpu_virt_ras_cmd *)ras_mgr->virt_ras_cmd;
	int ret;

	if (cmd->input_size != sizeof(struct ras_cmd_cper_snapshot_req))
		return RAS_CMD__ERROR_INVALID_INPUT_SIZE;

	ret = amdgpu_virt_ras_send_remote_cmd(ras_core, cmd->cmd_id,
					      cmd->input_buff_raw, cmd->input_size,
					      cmd->output_buff_raw, sizeof(struct ras_cmd_cper_snapshot_rsp));
	if (ret)
		return ret;

	memset(&virt_ras->batch_mgr, 0, sizeof(virt_ras->batch_mgr));
	amdgpu_virt_ras_get_batch_trace_overview(ras_core,
						 &virt_ras->batch_mgr.batch_overview);

	cmd->output_size = sizeof(struct ras_cmd_cper_snapshot_rsp);
	return RAS_CMD__SUCCESS;
}

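/*
 * Collect pointers to the trace records of @batch_id. Records are fetched
 * from the host up to RAS_CMD_MAX_BATCH_NUM batches at a time and kept in
 * @rsp_cache; a remote query is only issued on a cache miss. Returns the
 * number of entries written to @trace_arr, or a negative errno.
 */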
static int amdgpu_virt_ras_get_batch_records(struct ras_core_context *ras_core, uint64_t batch_id,
		struct ras_log_info **trace_arr, uint32_t arr_num,
		struct ras_cmd_batch_trace_record_rsp *rsp_cache)
{
	struct ras_cmd_batch_trace_record_req req = {
		.start_batch_id = batch_id,
		.batch_num = RAS_CMD_MAX_BATCH_NUM,
	};
	struct ras_cmd_batch_trace_record_rsp *rsp = rsp_cache;
	struct batch_ras_trace_info *batch;
	int ret = 0;
	uint32_t i;

	if (!rsp->real_batch_num || (batch_id < rsp->start_batch_id) ||
	    (batch_id >= (rsp->start_batch_id + rsp->real_batch_num))) {

		memset(rsp, 0, sizeof(*rsp));
		ret = amdgpu_virt_ras_send_remote_cmd(ras_core, RAS_CMD__GET_BATCH_TRACE_RECORD,
						      &req, sizeof(req), rsp, sizeof(*rsp));
		if (ret)
			return -EPIPE;
	}

	batch = &rsp->batchs[batch_id - rsp->start_batch_id];
	if (batch_id != batch->batch_id)
		return -ENODATA;

	for (i = 0; i < batch->trace_num; i++) {
		if (i >= arr_num)
			break;
		trace_arr[i] = &rsp->records[batch->offset + i];
	}

	return i;
}

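/*
 * Generate CPER data for a range of batch IDs and copy it to the
 * user-space buffer described by the request. Generation stops when the
 * host has no newer batches or the buffer fills up; -ENOMEM from the CPER
 * encoder is reported back as a remaining-record count instead of a
 * failure.
 */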
static int amdgpu_virt_ras_get_cper_records(struct ras_core_context *ras_core,
		struct ras_cmd_ctx *cmd, void *data)
{
	struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(ras_core->dev);
	struct amdgpu_virt_ras_cmd *virt_ras =
		(struct amdgpu_virt_ras_cmd *)ras_mgr->virt_ras_cmd;
	struct ras_cmd_cper_record_req *req =
		(struct ras_cmd_cper_record_req *)cmd->input_buff_raw;
	struct ras_cmd_cper_record_rsp *rsp =
		(struct ras_cmd_cper_record_rsp *)cmd->output_buff_raw;
	struct ras_log_batch_overview *overview = &virt_ras->batch_mgr.batch_overview;
	struct ras_cmd_batch_trace_record_rsp *rsp_cache = &virt_ras->batch_mgr.batch_trace;
	struct ras_log_info **trace;
	uint32_t offset = 0, real_data_len = 0;
	uint64_t batch_id;
	uint8_t *out_buf;
	int ret = 0, i, count;

	if (cmd->input_size != sizeof(struct ras_cmd_cper_record_req))
		return RAS_CMD__ERROR_INVALID_INPUT_SIZE;

	if (!req->buf_size || !req->buf_ptr || !req->cper_num)
		return RAS_CMD__ERROR_INVALID_INPUT_DATA;

	trace = kzalloc_objs(*trace, MAX_RECORD_PER_BATCH);
	if (!trace)
		return RAS_CMD__ERROR_GENERIC;

	out_buf = kzalloc(req->buf_size, GFP_KERNEL);
	if (!out_buf) {
		kfree(trace);
		return RAS_CMD__ERROR_GENERIC;
	}

	for (i = 0; i < req->cper_num; i++) {
		batch_id = req->cper_start_id + i;
		if (batch_id >= overview->last_batch_id)
			break;
		count = amdgpu_virt_ras_get_batch_records(ras_core, batch_id,
							  trace, MAX_RECORD_PER_BATCH,
							  rsp_cache);
		if (count > 0) {
			ret = ras_cper_generate_cper(ras_core, trace, count,
						     &out_buf[offset], req->buf_size - offset, &real_data_len);
			if (ret)
				break;

			offset += real_data_len;
		}
	}

	if ((ret && (ret != -ENOMEM)) ||
	    copy_to_user(u64_to_user_ptr(req->buf_ptr), out_buf, offset)) {
		kfree(out_buf);
		kfree(trace);
		return RAS_CMD__ERROR_GENERIC;
	}

	rsp->real_data_size = offset;
	rsp->real_cper_num = i;
	rsp->remain_num = (ret == -ENOMEM) ? (req->cper_num - i) : 0;
	rsp->version = 0;

	cmd->output_size = sizeof(struct ras_cmd_cper_record_rsp);

	kfree(out_buf);
	kfree(trace);

	return RAS_CMD__SUCCESS;
}

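/*
 * Pre-populate the shared blocks-ecc buffer with a
 * RAS_CMD__GET_ALL_BLOCK_ECC_STATUS command header, presumably so the host
 * auto-update path can service the command in place.
 */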
static int __fill_get_blocks_ecc_cmd(struct amdgpu_device *adev,
		struct vram_blocks_ecc *blks_ecc)
{
	struct ras_cmd_ctx *rcmd;

	if (!blks_ecc || !blks_ecc->shared_mem.cpu_addr)
		return -EINVAL;

	rcmd = (struct ras_cmd_ctx *)blks_ecc->shared_mem.cpu_addr;

	rcmd->cmd_id = RAS_CMD__GET_ALL_BLOCK_ECC_STATUS;
	rcmd->input_size = sizeof(struct ras_cmd_blocks_ecc_req);
	rcmd->output_buf_size = blks_ecc->shared_mem.size - sizeof(*rcmd);

	return 0;
}

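/*
 * Register (@reg == true) or unregister a shared buffer for host-driven
 * auto-update of the given command; @gpa_addr/@len describe the buffer.
 */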
static int __set_cmd_auto_update(struct amdgpu_device *adev,
		enum ras_cmd_id cmd_id, uint64_t gpa_addr, uint32_t len, bool reg)
{
	struct ras_cmd_auto_update_req req = {0};
	struct ras_cmd_auto_update_rsp rsp = {0};
	int ret;

	req.mode = reg ? 1 : 0;
	req.cmd_id = cmd_id;
	req.addr = gpa_addr;
	req.len = len;
	ret = amdgpu_ras_mgr_handle_ras_cmd(adev, RAS_CMD__SET_CMD_AUTO_UPDATE,
					    &req, sizeof(req), &rsp, sizeof(rsp));

	return ret;
}

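/*
 * Report the ECC counters (CE/UE/DE) of a single RAS block from the shared
 * blocks-ecc buffer. On first use the buffer is registered with the host
 * for auto-update, so later queries need no extra round trip.
 */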
static int amdgpu_virt_ras_get_block_ecc(struct ras_core_context *ras_core,
		struct ras_cmd_ctx *cmd, void *data)
{
	struct amdgpu_device *adev = ras_core->dev;
	struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
	struct amdgpu_virt_ras_cmd *virt_ras =
		(struct amdgpu_virt_ras_cmd *)ras_mgr->virt_ras_cmd;
	struct vram_blocks_ecc *blks_ecc = &virt_ras->blocks_ecc;
	struct ras_cmd_ctx *blks_ecc_cmd_ctx;
	struct ras_cmd_blocks_ecc_rsp *blks_ecc_rsp;
	struct ras_cmd_block_ecc_info_req *input_data =
		(struct ras_cmd_block_ecc_info_req *)cmd->input_buff_raw;
	struct ras_cmd_block_ecc_info_rsp *output_data =
		(struct ras_cmd_block_ecc_info_rsp *)cmd->output_buff_raw;
	int ret = 0;

	if (cmd->input_size != sizeof(struct ras_cmd_block_ecc_info_req))
		return RAS_CMD__ERROR_INVALID_INPUT_SIZE;

	if (input_data->block_id >= MAX_RAS_BLOCK_NUM)
		return RAS_CMD__ERROR_INVALID_INPUT_DATA;

	if (__fill_get_blocks_ecc_cmd(adev, blks_ecc))
		return RAS_CMD__ERROR_GENERIC;

	if (!virt_ras->blocks_ecc.auto_update_actived) {
		ret = __set_cmd_auto_update(adev, RAS_CMD__GET_ALL_BLOCK_ECC_STATUS,
					    blks_ecc->shared_mem.gpa,
					    blks_ecc->shared_mem.size, true);
		if (ret)
			return ret;

		blks_ecc->auto_update_actived = true;
	}

	blks_ecc_cmd_ctx = blks_ecc->shared_mem.cpu_addr;
	blks_ecc_rsp = (struct ras_cmd_blocks_ecc_rsp *)blks_ecc_cmd_ctx->output_buff_raw;

	output_data->ce_count = blks_ecc_rsp->blocks[input_data->block_id].ce_count;
	output_data->ue_count = blks_ecc_rsp->blocks[input_data->block_id].ue_count;
	output_data->de_count = blks_ecc_rsp->blocks[input_data->block_id].de_count;

	cmd->output_size = sizeof(struct ras_cmd_block_ecc_info_rsp);
	return RAS_CMD__SUCCESS;
}

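/*
 * Ask the host to validate @address; *hit is set when the host reports a
 * match.
 */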
int amdgpu_virt_ras_check_address_validity(struct amdgpu_device *adev,
		uint64_t address, bool *hit)
{
	struct ras_cmd_address_check_req req = {0};
	struct ras_cmd_address_check_rsp rsp = {0};
	int ret = 0;

	req.address = address;

	ret = amdgpu_ras_mgr_handle_ras_cmd(adev, RAS_CMD__CHECK_ADDRESS_VALIDITY,
					    &req, sizeof(req), &rsp, sizeof(rsp));

	if (ret)
		return RAS_CMD__ERROR_GENERIC;

	*hit = rsp.result ? true : false;

	return RAS_CMD__SUCCESS;
}

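/*
 * Translate a retired address into the page frame numbers it maps to.
 * Up to @max_pfn_sz entries are written to @pfn; returns the number of
 * PFNs filled in, or a negative errno.
 */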
int amdgpu_virt_ras_convert_retired_address(struct amdgpu_device *adev,
		uint64_t address, uint64_t *pfn, uint32_t max_pfn_sz)
{
	struct ras_cmd_convert_retired_address_req req = {0};
	struct ras_cmd_convert_retired_address_rsp rsp = {0};
	int ret = 0, i;
	int retired_page_count;

	if (!pfn || !max_pfn_sz)
		return -EINVAL;

	req.address = address;

	ret = amdgpu_ras_mgr_handle_ras_cmd(adev, RAS_CMD__CONVERT_RETIRED_ADDRESS,
					    &req, sizeof(req), &rsp, sizeof(rsp));

	if (ret || rsp.retired_count == 0)
		return -EINVAL;

	retired_page_count = rsp.retired_count > max_pfn_sz ? max_pfn_sz : rsp.retired_count;

	for (i = 0; i < retired_page_count; i++)
		pfn[i] = rsp.retired_addr[i] >> AMDGPU_GPU_PAGE_SHIFT;

	return retired_page_count;
}

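/* Commands with guest-side handlers; everything else goes straight to the host. */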
static struct ras_cmd_func_map amdgpu_virt_ras_cmd_maps[] = {
	{RAS_CMD__GET_CPER_SNAPSHOT, amdgpu_virt_ras_get_cper_snapshot},
	{RAS_CMD__GET_CPER_RECORD, amdgpu_virt_ras_get_cper_records},
	{RAS_CMD__GET_BLOCK_ECC_STATUS, amdgpu_virt_ras_get_block_ecc},
};

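/*
 * Dispatch a RAS command on a virtual function: run the local handler from
 * amdgpu_virt_ras_cmd_maps[] if one exists, otherwise forward the command
 * to the host verbatim. Oversized responses are flagged, not truncated.
 */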
int amdgpu_virt_ras_handle_cmd(struct ras_core_context *ras_core,
		struct ras_cmd_ctx *cmd)
{
	struct ras_cmd_func_map *ras_cmd = NULL;
	int i, res;

	for (i = 0; i < ARRAY_SIZE(amdgpu_virt_ras_cmd_maps); i++) {
		if (cmd->cmd_id == amdgpu_virt_ras_cmd_maps[i].cmd_id) {
			ras_cmd = &amdgpu_virt_ras_cmd_maps[i];
			break;
		}
	}

	if (ras_cmd)
		res = ras_cmd->func(ras_core, cmd, NULL);
	else
		res = amdgpu_virt_ras_remote_ioctl_cmd(ras_core, cmd,
						       cmd->output_buff_raw, cmd->output_buf_size);

	cmd->cmd_res = res;

	if (cmd->output_size > cmd->output_buf_size) {
		RAS_DEV_ERR(ras_core->dev,
			    "Output data size 0x%x exceeds buffer size 0x%x!\n",
			    cmd->output_size, cmd->output_buf_size);
		return RAS_CMD__SUCCESS_EXEED_BUFFER;
	}

	return RAS_CMD__SUCCESS;
}

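/* Allocate the per-device virt RAS command state and its access lock. */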
int amdgpu_virt_ras_sw_init(struct amdgpu_device *adev)
{
	struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
	struct amdgpu_virt_ras_cmd *virt_ras_cmd;

	ras_mgr->virt_ras_cmd = kzalloc_obj(struct amdgpu_virt_ras_cmd);
	if (!ras_mgr->virt_ras_cmd)
		return -ENOMEM;

	virt_ras_cmd = ras_mgr->virt_ras_cmd;
	mutex_init(&virt_ras_cmd->remote_access_lock);

	return 0;
}

int amdgpu_virt_ras_sw_fini(struct amdgpu_device *adev)
{
	struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
	struct amdgpu_virt_ras_cmd *virt_ras_cmd = ras_mgr->virt_ras_cmd;

	mutex_destroy(&virt_ras_cmd->remote_access_lock);
	kfree(ras_mgr->virt_ras_cmd);
	ras_mgr->virt_ras_cmd = NULL;

	return 0;
}

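/*
 * Refresh RAS capabilities from the host and resolve the shared memory
 * backing the all-blocks ECC snapshot.
 */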
int amdgpu_virt_ras_hw_init(struct amdgpu_device *adev)
{
	struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
	struct amdgpu_virt_ras_cmd *virt_ras =
		(struct amdgpu_virt_ras_cmd *)ras_mgr->virt_ras_cmd;
	struct vram_blocks_ecc *blks_ecc = &virt_ras->blocks_ecc;

	amdgpu_virt_get_ras_capability(adev);

	memset(blks_ecc, 0, sizeof(*blks_ecc));
	if (amdgpu_virt_ras_get_cmd_shared_mem(ras_mgr->ras_core,
					       RAS_CMD__GET_ALL_BLOCK_ECC_STATUS,
					       AMD_SRIOV_UNIRAS_BLOCKS_BUF_SIZE, &blks_ecc->shared_mem))
		return -ENOMEM;

	return 0;
}

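/* Scrub the shared blocks-ecc buffer and drop the cached mapping. */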
int amdgpu_virt_ras_hw_fini(struct amdgpu_device *adev)
{
	struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
	struct amdgpu_virt_ras_cmd *virt_ras =
		(struct amdgpu_virt_ras_cmd *)ras_mgr->virt_ras_cmd;
	struct vram_blocks_ecc *blks_ecc = &virt_ras->blocks_ecc;

	if (blks_ecc->shared_mem.cpu_addr)
		memset(blks_ecc->shared_mem.cpu_addr, 0, blks_ecc->shared_mem.size);

	memset(blks_ecc, 0, sizeof(*blks_ecc));

	return 0;
}

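/*
 * A reset presumably invalidates the host-side auto-update registration;
 * clear the flag so the next block ECC query re-registers the buffer.
 */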
int amdgpu_virt_ras_pre_reset(struct amdgpu_device *adev)
{
	struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
	struct amdgpu_virt_ras_cmd *virt_ras =
		(struct amdgpu_virt_ras_cmd *)ras_mgr->virt_ras_cmd;

	virt_ras->blocks_ecc.auto_update_actived = false;
	return 0;
}

int amdgpu_virt_ras_post_reset(struct amdgpu_device *adev)
{
	return 0;
}

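/* Record whether the host supports the remote unified RAS interface. */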
void amdgpu_virt_ras_set_remote_uniras(struct amdgpu_device *adev, bool en)
{
	struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
	struct amdgpu_virt_ras_cmd *virt_ras;

	if (!ras_mgr || !ras_mgr->virt_ras_cmd)
		return;

	virt_ras = (struct amdgpu_virt_ras_cmd *)ras_mgr->virt_ras_cmd;
	virt_ras->remote_uniras_supported = en;
}

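/*
 * Remote unified RAS is usable only when the host advertised support and
 * the device is not in reset.
 */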
bool amdgpu_virt_ras_remote_uniras_enabled(struct amdgpu_device *adev)
{
	struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
	struct amdgpu_virt_ras_cmd *virt_ras;

	if (amdgpu_in_reset(adev))
		return false;

	if (!ras_mgr || !ras_mgr->virt_ras_cmd)
		return false;

	virt_ras = (struct amdgpu_virt_ras_cmd *)ras_mgr->virt_ras_cmd;

	return virt_ras->remote_uniras_supported;
}