xref: /linux/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_virt_ras_cmd.c (revision f9db1378f11092c4f22737331c4f2adad1bc0045)
1 // SPDX-License-Identifier: MIT
2 /*
3  * Copyright 2025 Advanced Micro Devices, Inc.
4  *
5  * Permission is hereby granted, free of charge, to any person obtaining a
6  * copy of this software and associated documentation files (the "Software"),
7  * to deal in the Software without restriction, including without limitation
8  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9  * and/or sell copies of the Software, and to permit persons to whom the
10  * Software is furnished to do so, subject to the following conditions:
11  *
12  * The above copyright notice and this permission notice shall be included in
13  * all copies or substantial portions of the Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
19  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
20  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
21  * OTHER DEALINGS IN THE SOFTWARE.
22  *
23  */
24 
25 #include <linux/pci.h>
26 #include "amdgpu.h"
27 #include "amdgpu_ras.h"
28 #include "ras_sys.h"
29 #include "amdgpu_ras_cmd.h"
30 #include "amdgpu_virt_ras_cmd.h"
31 #include "amdgpu_ras_mgr.h"
32 
33 static int amdgpu_virt_ras_get_cmd_shared_mem(struct ras_core_context *ras_core,
34 		uint32_t cmd, uint32_t mem_size, struct amdgpu_virt_shared_mem *shared_mem)
35 {
36 	struct amdgpu_device *adev = ras_core->dev;
37 	struct amdsriov_ras_telemetry *ras_telemetry_cpu;
38 	struct amdsriov_ras_telemetry *ras_telemetry_gpu;
39 	void *fw_va = adev->mman.resv_region[AMDGPU_RESV_FW_VRAM_USAGE].cpu_ptr;
40 	void *drv_va = adev->mman.resv_region[AMDGPU_RESV_DRV_VRAM_USAGE].cpu_ptr;
41 	uint64_t fw_vram_usage_start_offset = 0;
42 	uint64_t ras_telemetry_offset = 0;
43 
44 	if (!adev->virt.fw_reserve.ras_telemetry)
45 		return -EINVAL;
46 
47 	if (fw_va && fw_va <= adev->virt.fw_reserve.ras_telemetry) {
48 		fw_vram_usage_start_offset = adev->mman.resv_region[AMDGPU_RESV_FW_VRAM_USAGE].offset;
49 		ras_telemetry_offset = (uintptr_t)adev->virt.fw_reserve.ras_telemetry -
50 				(uintptr_t)fw_va;
51 	} else if (drv_va && drv_va <= adev->virt.fw_reserve.ras_telemetry) {
52 		fw_vram_usage_start_offset = adev->mman.resv_region[AMDGPU_RESV_DRV_VRAM_USAGE].offset;
53 		ras_telemetry_offset = (uintptr_t)adev->virt.fw_reserve.ras_telemetry -
54 				(uintptr_t)drv_va;
55 	} else {
56 		return -EINVAL;
57 	}
58 
59 	ras_telemetry_cpu =
60 		(struct amdsriov_ras_telemetry *)adev->virt.fw_reserve.ras_telemetry;
61 	ras_telemetry_gpu =
62 		(struct amdsriov_ras_telemetry *)(uintptr_t)(fw_vram_usage_start_offset +
63 				ras_telemetry_offset);
64 
65 	if (cmd == RAS_CMD__GET_ALL_BLOCK_ECC_STATUS) {
66 		if (mem_size > AMD_SRIOV_UNIRAS_BLOCKS_BUF_SIZE)
67 			return -ENOMEM;
68 
69 		shared_mem->cpu_addr = ras_telemetry_cpu->uniras_shared_mem.blocks_ecc_buf;
70 		shared_mem->gpa =
71 			(uintptr_t)ras_telemetry_gpu->uniras_shared_mem.blocks_ecc_buf -
72 					adev->gmc.vram_start;
73 		shared_mem->size = mem_size;
74 	} else {
75 		if (mem_size > AMD_SRIOV_UNIRAS_CMD_MAX_SIZE)
76 			return -ENOMEM;
77 
78 		shared_mem->cpu_addr = ras_telemetry_cpu->uniras_shared_mem.cmd_buf;
79 		shared_mem->gpa =
80 			(uintptr_t)ras_telemetry_gpu->uniras_shared_mem.cmd_buf -
81 					adev->gmc.vram_start;
82 		shared_mem->size = mem_size;
83 	}
84 
85 	return 0;
86 }
87 
88 static int amdgpu_virt_ras_remote_ioctl_cmd(struct ras_core_context *ras_core,
89 			struct ras_cmd_ctx *cmd, void *output_data, uint32_t output_size)
90 {
91 	struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(ras_core->dev);
92 	struct amdgpu_virt_ras_cmd *virt_ras = ras_mgr->virt_ras_cmd;
93 	uint32_t mem_len = ALIGN(sizeof(*cmd) + output_size, AMDGPU_GPU_PAGE_SIZE);
94 	struct ras_cmd_ctx *rcmd;
95 	struct ras_cmd_ctx hdr_snap;
96 	struct amdgpu_virt_shared_mem shared_mem = {0};
97 	int ret = 0;
98 
99 	mutex_lock(&virt_ras->remote_access_lock);
100 
101 	ret = amdgpu_virt_ras_get_cmd_shared_mem(ras_core, cmd->cmd_id, mem_len, &shared_mem);
102 	if (ret)
103 		goto out;
104 
105 	rcmd = (struct ras_cmd_ctx *)shared_mem.cpu_addr;
106 	memset(rcmd, 0, mem_len);
107 	memcpy(rcmd, cmd, sizeof(*cmd));
108 
109 	ret = amdgpu_virt_send_remote_ras_cmd(ras_core->dev,
110 				shared_mem.gpa, mem_len);
111 	if (!ret) {
112 		/*
113 		 * rcmd lives in shared memory the PF can mutate at any time.
114 		 * Snapshot the entire fixed-size response header into a local
115 		 * struct in one shot so every subsequent decision (cmd_res,
116 		 * output_size, version, etc.) operates on a stable copy. This
117 		 * defeats double-fetch / TOCTOU attacks where a malicious or
118 		 * buggy PF could flip cmd_res from SUCCESS to an error after
119 		 * our success branch, or enlarge output_size between the
120 		 * bounds check and the memcpy below to corrupt the caller's
121 		 * local output buffer.
122 		 */
123 		memcpy(&hdr_snap, rcmd, sizeof(hdr_snap));
124 		barrier();
125 
126 		if (hdr_snap.cmd_res) {
127 			ret = hdr_snap.cmd_res;
128 			goto out;
129 		}
130 
131 		cmd->cmd_res = hdr_snap.cmd_res;
132 		cmd->output_size = hdr_snap.output_size;
133 
134 		if (hdr_snap.output_size && output_data &&
135 		    hdr_snap.output_size <= output_size)
136 			memcpy(output_data, rcmd->output_buff_raw, hdr_snap.output_size);
137 	}
138 
139 out:
140 	mutex_unlock(&virt_ras->remote_access_lock);
141 	return ret;
142 }
143 
144 static int amdgpu_virt_ras_send_remote_cmd(struct ras_core_context *ras_core,
145 	uint32_t cmd_id, void *input_data, uint32_t input_size,
146 	void *output_data, uint32_t output_size)
147 {
148 	struct ras_cmd_ctx rcmd = {0};
149 	int ret;
150 
151 	if (input_size > RAS_CMD_MAX_IN_SIZE)
152 		return RAS_CMD__ERROR_INVALID_INPUT_SIZE;
153 
154 	rcmd.cmd_id = cmd_id;
155 	rcmd.input_size = input_size;
156 	memcpy(rcmd.input_buff_raw, input_data, input_size);
157 
158 	ret = amdgpu_virt_ras_remote_ioctl_cmd(ras_core,
159 				&rcmd, output_data, output_size);
160 	if (!ret) {
161 		if (rcmd.output_size != output_size)
162 			return RAS_CMD__ERROR_GENERIC;
163 	}
164 
165 	return ret;
166 }
167 
168 static int amdgpu_virt_ras_get_batch_trace_overview(struct ras_core_context *ras_core,
169 	struct ras_log_batch_overview *overview)
170 {
171 	struct ras_cmd_batch_trace_snapshot_req req = {0};
172 	struct ras_cmd_batch_trace_snapshot_rsp rsp = {0};
173 	int ret;
174 
175 	ret = amdgpu_virt_ras_send_remote_cmd(ras_core, RAS_CMD__GET_BATCH_TRACE_SNAPSHOT,
176 				&req, sizeof(req), &rsp, sizeof(rsp));
177 	if (ret)
178 		return ret;
179 
180 	overview->first_batch_id = rsp.start_batch_id;
181 	overview->last_batch_id = rsp.latest_batch_id;
182 	overview->logged_batch_count = rsp.total_batch_num;
183 
184 	return RAS_CMD__SUCCESS;
185 }
186 
187 static int amdgpu_virt_ras_get_cper_snapshot(struct ras_core_context *ras_core,
188 			struct ras_cmd_ctx *cmd, void *data)
189 {
190 	struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(ras_core->dev);
191 	struct amdgpu_virt_ras_cmd *virt_ras =
192 			(struct amdgpu_virt_ras_cmd *)ras_mgr->virt_ras_cmd;
193 	int ret;
194 
195 	if (cmd->input_size != sizeof(struct ras_cmd_cper_snapshot_req))
196 		return RAS_CMD__ERROR_INVALID_INPUT_SIZE;
197 
198 	ret = amdgpu_virt_ras_send_remote_cmd(ras_core, cmd->cmd_id,
199 			cmd->input_buff_raw, cmd->input_size,
200 			cmd->output_buff_raw, sizeof(struct ras_cmd_cper_snapshot_rsp));
201 	if (ret)
202 		return ret;
203 
204 	memset(&virt_ras->batch_mgr, 0, sizeof(virt_ras->batch_mgr));
205 	amdgpu_virt_ras_get_batch_trace_overview(ras_core,
206 					&virt_ras->batch_mgr.batch_overview);
207 
208 	cmd->output_size = sizeof(struct ras_cmd_cper_snapshot_rsp);
209 	return RAS_CMD__SUCCESS;
210 }
211 
212 static bool amdgpu_virt_ras_check_batch_cached(struct ras_cmd_batch_trace_record_rsp *rsp,
213 				       uint64_t batch_id)
214 {
215 	return rsp->real_batch_num &&
216 	       rsp->real_batch_num <= RAS_CMD_MAX_BATCH_NUM &&
217 	       batch_id >= rsp->start_batch_id &&
218 	       (batch_id - rsp->start_batch_id) < rsp->real_batch_num;
219 }
220 
221 static int amdgpu_virt_ras_get_batch_records(struct ras_core_context *ras_core, uint64_t batch_id,
222 			struct ras_log_info *trace_arr, uint32_t arr_num,
223 			struct ras_cmd_batch_trace_record_rsp *rsp_cache)
224 {
225 	struct ras_cmd_batch_trace_record_req req = {
226 		.start_batch_id = batch_id,
227 		.batch_num = RAS_CMD_MAX_BATCH_NUM,
228 	};
229 	struct ras_cmd_batch_trace_record_rsp *rsp = rsp_cache;
230 	struct batch_ras_trace_info *batch;
231 	int ret = 0;
232 	uint32_t i;
233 	uint32_t idx;
234 
235 	if (!amdgpu_virt_ras_check_batch_cached(rsp, batch_id)) {
236 		memset(rsp, 0, sizeof(*rsp));
237 		ret = amdgpu_virt_ras_send_remote_cmd(ras_core, RAS_CMD__GET_BATCH_TRACE_RECORD,
238 			&req, sizeof(req), rsp, sizeof(*rsp));
239 		if (ret)
240 			return -EPIPE;
241 
242 		if (!amdgpu_virt_ras_check_batch_cached(rsp, batch_id)) {
243 			memset(rsp, 0, sizeof(*rsp));
244 			return -EIO;
245 		}
246 	}
247 
248 	idx = (uint32_t)(batch_id - rsp->start_batch_id);
249 	batch = &rsp->batchs[idx];
250 	if (batch_id != batch->batch_id ||
251 	    batch->trace_num > MAX_RECORD_PER_BATCH ||
252 	    (uint32_t)batch->offset + batch->trace_num > RAS_CMD_MAX_TRACE_NUM) {
253 		memset(rsp, 0, sizeof(*rsp));
254 		return -EIO;
255 	}
256 
257 	for (i = 0; i < batch->trace_num && i < arr_num; i++)
258 		memcpy(&trace_arr[i],
259 			&rsp->records[batch->offset + i], sizeof(*trace_arr));
260 
261 	return i;
262 }
263 
264 static int amdgpu_virt_ras_get_cper_records(struct ras_core_context *ras_core,
265 	struct ras_cmd_ctx *cmd, void *data)
266 {
267 	struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(ras_core->dev);
268 	struct amdgpu_virt_ras_cmd *virt_ras =
269 			(struct amdgpu_virt_ras_cmd *)ras_mgr->virt_ras_cmd;
270 	struct ras_cmd_cper_record_req *req =
271 		(struct ras_cmd_cper_record_req *)cmd->input_buff_raw;
272 	struct ras_cmd_cper_record_rsp *rsp =
273 		(struct ras_cmd_cper_record_rsp *)cmd->output_buff_raw;
274 	struct ras_log_batch_overview *overview = &virt_ras->batch_mgr.batch_overview;
275 	struct ras_cmd_batch_trace_record_rsp *rsp_cache = &virt_ras->batch_mgr.batch_trace;
276 	struct ras_log_info *trace;
277 	uint32_t trace_count = MAX_RECORD_PER_BATCH;
278 	uint32_t offset = 0, real_data_len = 0;
279 	uint64_t batch_id;
280 	uint8_t *out_buf;
281 	int ret = 0, i, count;
282 
283 	if (cmd->input_size != sizeof(struct ras_cmd_cper_record_req) ||
284 		(cmd->output_buf_size < sizeof(*rsp)))
285 		return RAS_CMD__ERROR_INVALID_INPUT_SIZE;
286 
287 	if (!req->buf_size || !req->buf_ptr || !req->cper_num ||
288 	    req->buf_size > RAS_CMD_MAX_CPER_BUF_SZ)
289 		return RAS_CMD__ERROR_INVALID_INPUT_DATA;
290 
291 	trace = kzalloc_objs(*trace, trace_count);
292 	if (!trace)
293 		return RAS_CMD__ERROR_GENERIC;
294 
295 	out_buf = kzalloc(req->buf_size, GFP_KERNEL);
296 	if (!out_buf) {
297 		kfree(trace);
298 		return RAS_CMD__ERROR_GENERIC;
299 	}
300 
301 	memset(out_buf, 0, req->buf_size);
302 
303 	for (i = 0; i < req->cper_num; i++) {
304 		batch_id = req->cper_start_id + i;
305 		if (batch_id >= overview->last_batch_id)
306 			break;
307 		count = amdgpu_virt_ras_get_batch_records(ras_core, batch_id,
308 							  trace, trace_count,
309 							  rsp_cache);
310 		if (count > 0) {
311 			ret = ras_cper_generate_cper(ras_core, trace, count,
312 					&out_buf[offset], req->buf_size - offset, &real_data_len);
313 			if (ret)
314 				break;
315 
316 			offset += real_data_len;
317 		}
318 	}
319 
320 	if ((ret && (ret != -ENOMEM)) ||
321 	    copy_to_user(u64_to_user_ptr(req->buf_ptr), out_buf, offset)) {
322 		kfree(out_buf);
323 		kfree(trace);
324 		return RAS_CMD__ERROR_GENERIC;
325 	}
326 
327 	rsp->real_data_size = offset;
328 	rsp->real_cper_num = i;
329 	rsp->remain_num = (ret == -ENOMEM) ? (req->cper_num - i) : 0;
330 	rsp->version = 0;
331 
332 	cmd->output_size = sizeof(struct ras_cmd_cper_record_rsp);
333 
334 	kfree(out_buf);
335 	kfree(trace);
336 
337 	return RAS_CMD__SUCCESS;
338 }
339 
340 static int __fill_get_blocks_ecc_cmd(struct amdgpu_device *adev,
341 			struct vram_blocks_ecc *blks_ecc)
342 {
343 	struct ras_cmd_ctx *rcmd;
344 
345 	if (!blks_ecc || !blks_ecc->shared_mem.cpu_addr)
346 		return -EINVAL;
347 
348 	rcmd = (struct ras_cmd_ctx *)blks_ecc->shared_mem.cpu_addr;
349 
350 	rcmd->cmd_id = RAS_CMD__GET_ALL_BLOCK_ECC_STATUS;
351 	rcmd->input_size = sizeof(struct ras_cmd_blocks_ecc_req);
352 	rcmd->output_buf_size = blks_ecc->shared_mem.size - sizeof(*rcmd);
353 
354 	return 0;
355 }
356 
357 static int __set_cmd_auto_update(struct amdgpu_device *adev,
358 			enum ras_cmd_id cmd_id, uint64_t gpa_addr, uint32_t len, bool reg)
359 {
360 	struct ras_cmd_auto_update_req req = {0};
361 	struct ras_cmd_auto_update_rsp rsp = {0};
362 	int ret;
363 
364 	req.mode = reg ? 1 : 0;
365 	req.cmd_id = cmd_id;
366 	req.addr = gpa_addr;
367 	req.len = len;
368 	ret = amdgpu_ras_mgr_handle_ras_cmd(adev, RAS_CMD__SET_CMD_AUTO_UPDATE,
369 		&req, sizeof(req), &rsp, sizeof(rsp));
370 
371 	return ret;
372 }
373 
374 static int amdgpu_virt_ras_get_block_ecc(struct ras_core_context *ras_core,
375 				struct ras_cmd_ctx *cmd, void *data)
376 {
377 	struct amdgpu_device *adev = ras_core->dev;
378 	struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
379 	struct amdgpu_virt_ras_cmd *virt_ras =
380 			(struct amdgpu_virt_ras_cmd *)ras_mgr->virt_ras_cmd;
381 	struct vram_blocks_ecc *blks_ecc = &virt_ras->blocks_ecc;
382 	struct ras_cmd_ctx *blks_ecc_cmd_ctx;
383 	struct ras_cmd_blocks_ecc_rsp *blks_ecc_rsp;
384 	struct ras_cmd_block_ecc_info_req *input_data =
385 			(struct ras_cmd_block_ecc_info_req *)cmd->input_buff_raw;
386 	struct ras_cmd_block_ecc_info_rsp *output_data =
387 			(struct ras_cmd_block_ecc_info_rsp *)cmd->output_buff_raw;
388 	int ret = 0;
389 
390 	if (cmd->input_size != sizeof(struct ras_cmd_block_ecc_info_req))
391 		return RAS_CMD__ERROR_INVALID_INPUT_SIZE;
392 
393 	if (input_data->block_id >= MAX_RAS_BLOCK_NUM)
394 		return RAS_CMD__ERROR_INVALID_INPUT_DATA;
395 
396 	if (__fill_get_blocks_ecc_cmd(adev, blks_ecc))
397 		return RAS_CMD__ERROR_GENERIC;
398 
399 	if (!virt_ras->blocks_ecc.auto_update_actived) {
400 		ret = __set_cmd_auto_update(adev, RAS_CMD__GET_ALL_BLOCK_ECC_STATUS,
401 				blks_ecc->shared_mem.gpa,
402 				blks_ecc->shared_mem.size, true);
403 		if (ret)
404 			return ret;
405 
406 		blks_ecc->auto_update_actived = true;
407 	}
408 
409 	blks_ecc_cmd_ctx = blks_ecc->shared_mem.cpu_addr;
410 	blks_ecc_rsp = (struct ras_cmd_blocks_ecc_rsp *)blks_ecc_cmd_ctx->output_buff_raw;
411 
412 	output_data->ce_count = blks_ecc_rsp->blocks[input_data->block_id].ce_count;
413 	output_data->ue_count = blks_ecc_rsp->blocks[input_data->block_id].ue_count;
414 	output_data->de_count = blks_ecc_rsp->blocks[input_data->block_id].de_count;
415 
416 	cmd->output_size = sizeof(struct ras_cmd_block_ecc_info_rsp);
417 	return RAS_CMD__SUCCESS;
418 }
419 
420 int amdgpu_virt_ras_check_address_validity(struct amdgpu_device *adev,
421 			uint64_t address, bool *hit)
422 {
423 	struct ras_cmd_address_check_req req = {0};
424 	struct ras_cmd_address_check_rsp rsp = {0};
425 	int ret = 0;
426 
427 	req.address = address;
428 
429 	ret = amdgpu_ras_mgr_handle_ras_cmd(adev, RAS_CMD__CHECK_ADDRESS_VALIDITY,
430 		&req, sizeof(req), &rsp, sizeof(rsp));
431 
432 	if (ret)
433 		return RAS_CMD__ERROR_GENERIC;
434 
435 	*hit = rsp.result ? true : false;
436 
437 	return RAS_CMD__SUCCESS;
438 }
439 
440 int amdgpu_virt_ras_convert_retired_address(struct amdgpu_device *adev,
441 			uint64_t address, uint64_t *pfn, uint32_t max_pfn_sz)
442 {
443 	struct ras_cmd_convert_retired_address_req req = {0};
444 	struct ras_cmd_convert_retired_address_rsp rsp = {0};
445 	int ret = 0, i;
446 	int retired_page_count;
447 
448 	if (!pfn || !max_pfn_sz)
449 		return -EINVAL;
450 
451 	req.address = address;
452 
453 	ret = amdgpu_ras_mgr_handle_ras_cmd(adev, RAS_CMD__CONVERT_RETIRED_ADDRESS,
454 		&req, sizeof(req), &rsp, sizeof(rsp));
455 
456 	if (ret || rsp.retired_count == 0)
457 		return -EINVAL;
458 
459 	retired_page_count = rsp.retired_count > max_pfn_sz ? max_pfn_sz : rsp.retired_count;
460 
461 	for (i = 0; i < retired_page_count; i++)
462 		pfn[i] = rsp.retired_addr[i] >> AMDGPU_GPU_PAGE_SHIFT;
463 
464 	return retired_page_count;
465 }
466 
467 static struct ras_cmd_func_map amdgpu_virt_ras_cmd_maps[] = {
468 	{RAS_CMD__GET_CPER_SNAPSHOT, amdgpu_virt_ras_get_cper_snapshot},
469 	{RAS_CMD__GET_CPER_RECORD, amdgpu_virt_ras_get_cper_records},
470 	{RAS_CMD__GET_BLOCK_ECC_STATUS, amdgpu_virt_ras_get_block_ecc},
471 };
472 
473 int amdgpu_virt_ras_handle_cmd(struct ras_core_context *ras_core,
474 		struct ras_cmd_ctx *cmd)
475 {
476 	struct ras_cmd_func_map *ras_cmd = NULL;
477 	int i, res;
478 
479 	for (i = 0; i < ARRAY_SIZE(amdgpu_virt_ras_cmd_maps); i++) {
480 		if (cmd->cmd_id == amdgpu_virt_ras_cmd_maps[i].cmd_id) {
481 			ras_cmd = &amdgpu_virt_ras_cmd_maps[i];
482 			break;
483 		}
484 	}
485 
486 	if (ras_cmd)
487 		res = ras_cmd->func(ras_core, cmd, NULL);
488 	else
489 		res = amdgpu_virt_ras_remote_ioctl_cmd(ras_core, cmd,
490 					cmd->output_buff_raw, cmd->output_buf_size);
491 
492 	cmd->cmd_res = res;
493 
494 	if (!res && (cmd->output_size > cmd->output_buf_size)) {
495 		RAS_DEV_ERR(ras_core->dev,
496 			"Output data size 0x%x exceeds buffer size 0x%x!\n",
497 			cmd->output_size, cmd->output_buf_size);
498 		return RAS_CMD__SUCCESS_EXEED_BUFFER;
499 	}
500 
501 	return RAS_CMD__SUCCESS;
502 }
503 
504 int amdgpu_virt_ras_sw_init(struct amdgpu_device *adev)
505 {
506 	struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
507 	struct amdgpu_virt_ras_cmd *virt_ras_cmd;
508 
509 	ras_mgr->virt_ras_cmd = kzalloc_obj(struct amdgpu_virt_ras_cmd);
510 	if (!ras_mgr->virt_ras_cmd)
511 		return -ENOMEM;
512 
513 	virt_ras_cmd = ras_mgr->virt_ras_cmd;
514 	mutex_init(&virt_ras_cmd->remote_access_lock);
515 
516 	return 0;
517 }
518 
519 int amdgpu_virt_ras_sw_fini(struct amdgpu_device *adev)
520 {
521 	struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
522 	struct amdgpu_virt_ras_cmd *virt_ras_cmd = ras_mgr->virt_ras_cmd;
523 
524 	mutex_destroy(&virt_ras_cmd->remote_access_lock);
525 	kfree(ras_mgr->virt_ras_cmd);
526 	ras_mgr->virt_ras_cmd = NULL;
527 
528 	return 0;
529 }
530 
531 int amdgpu_virt_ras_hw_init(struct amdgpu_device *adev)
532 {
533 	struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
534 	struct amdgpu_virt_ras_cmd *virt_ras =
535 			(struct amdgpu_virt_ras_cmd *)ras_mgr->virt_ras_cmd;
536 	struct vram_blocks_ecc *blks_ecc = &virt_ras->blocks_ecc;
537 
538 	amdgpu_virt_get_ras_capability(adev);
539 
540 	memset(blks_ecc, 0, sizeof(*blks_ecc));
541 	if (amdgpu_virt_ras_get_cmd_shared_mem(ras_mgr->ras_core,
542 			RAS_CMD__GET_ALL_BLOCK_ECC_STATUS,
543 			AMD_SRIOV_UNIRAS_BLOCKS_BUF_SIZE, &blks_ecc->shared_mem))
544 		return -ENOMEM;
545 
546 	return 0;
547 }
548 
549 int amdgpu_virt_ras_hw_fini(struct amdgpu_device *adev)
550 {
551 	struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
552 	struct amdgpu_virt_ras_cmd *virt_ras =
553 			(struct amdgpu_virt_ras_cmd *)ras_mgr->virt_ras_cmd;
554 	struct vram_blocks_ecc *blks_ecc = &virt_ras->blocks_ecc;
555 
556 	if (blks_ecc->shared_mem.cpu_addr)
557 		memset(blks_ecc->shared_mem.cpu_addr, 0, blks_ecc->shared_mem.size);
558 
559 	memset(blks_ecc, 0, sizeof(*blks_ecc));
560 
561 	return 0;
562 }
563 
564 int amdgpu_virt_ras_pre_reset(struct amdgpu_device *adev)
565 {
566 	struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
567 	struct amdgpu_virt_ras_cmd *virt_ras =
568 		(struct amdgpu_virt_ras_cmd *)ras_mgr->virt_ras_cmd;
569 
570 	virt_ras->blocks_ecc.auto_update_actived = false;
571 	return 0;
572 }
573 
/*
 * Post-reset hook. Nothing needs to be done here at present; @adev is not
 * used.
 *
 * Always returns 0.
 */
int amdgpu_virt_ras_post_reset(struct amdgpu_device *adev)
{
	return 0;
}
578 
579 void amdgpu_virt_ras_set_remote_uniras(struct amdgpu_device *adev, bool en)
580 {
581 	struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
582 	struct amdgpu_virt_ras_cmd *virt_ras;
583 
584 	if (!ras_mgr || !ras_mgr->virt_ras_cmd)
585 		return;
586 
587 	virt_ras = (struct amdgpu_virt_ras_cmd *)ras_mgr->virt_ras_cmd;
588 	virt_ras->remote_uniras_supported = en;
589 }
590 
591 bool amdgpu_virt_ras_remote_uniras_enabled(struct amdgpu_device *adev)
592 {
593 	struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
594 	struct amdgpu_virt_ras_cmd *virt_ras;
595 
596 	if (amdgpu_in_reset(adev))
597 		return false;
598 
599 	if (!ras_mgr || !ras_mgr->virt_ras_cmd)
600 		return false;
601 
602 	virt_ras = (struct amdgpu_virt_ras_cmd *)ras_mgr->virt_ras_cmd;
603 
604 	return virt_ras->remote_uniras_supported;
605 }
606