// SPDX-License-Identifier: MIT
/*
 * Copyright 2025 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 */

#include <linux/pci.h>
#include "amdgpu.h"
#include "amdgpu_ras.h"
#include "ras_sys.h"
#include "amdgpu_ras_cmd.h"
#include "amdgpu_virt_ras_cmd.h"
#include "amdgpu_ras_mgr.h"

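/*
 * Locate the shared-memory window used to exchange RAS commands with the
 * host. The ras_telemetry area lives in either the FW or the driver
 * reserved VRAM region; derive both its CPU virtual address and its VRAM
 * offset, which is later turned into a guest physical address.
 * RAS_CMD__GET_ALL_BLOCK_ECC_STATUS uses the dedicated blocks_ecc_buf;
 * all other commands share cmd_buf.
 */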
static int amdgpu_virt_ras_get_cmd_shared_mem(struct ras_core_context *ras_core,
		uint32_t cmd, uint32_t mem_size, struct amdgpu_virt_shared_mem *shared_mem)
{
	struct amdgpu_device *adev = ras_core->dev;
	struct amdsriov_ras_telemetry *ras_telemetry_cpu;
	struct amdsriov_ras_telemetry *ras_telemetry_gpu;
	void *fw_va = adev->mman.resv_region[AMDGPU_RESV_FW_VRAM_USAGE].cpu_ptr;
	void *drv_va = adev->mman.resv_region[AMDGPU_RESV_DRV_VRAM_USAGE].cpu_ptr;
	uint64_t fw_vram_usage_start_offset = 0;
	uint64_t ras_telemetry_offset = 0;

	if (!adev->virt.fw_reserve.ras_telemetry)
		return -EINVAL;

	if (fw_va && fw_va <= adev->virt.fw_reserve.ras_telemetry) {
		fw_vram_usage_start_offset = adev->mman.resv_region[AMDGPU_RESV_FW_VRAM_USAGE].offset;
		ras_telemetry_offset = (uintptr_t)adev->virt.fw_reserve.ras_telemetry -
				(uintptr_t)fw_va;
	} else if (drv_va && drv_va <= adev->virt.fw_reserve.ras_telemetry) {
		fw_vram_usage_start_offset = adev->mman.resv_region[AMDGPU_RESV_DRV_VRAM_USAGE].offset;
		ras_telemetry_offset = (uintptr_t)adev->virt.fw_reserve.ras_telemetry -
				(uintptr_t)drv_va;
	} else {
		return -EINVAL;
	}

	ras_telemetry_cpu =
		(struct amdsriov_ras_telemetry *)adev->virt.fw_reserve.ras_telemetry;
	ras_telemetry_gpu =
		(struct amdsriov_ras_telemetry *)(uintptr_t)(fw_vram_usage_start_offset +
				ras_telemetry_offset);

	if (cmd == RAS_CMD__GET_ALL_BLOCK_ECC_STATUS) {
		if (mem_size > AMD_SRIOV_UNIRAS_BLOCKS_BUF_SIZE)
			return -ENOMEM;

		shared_mem->cpu_addr = ras_telemetry_cpu->uniras_shared_mem.blocks_ecc_buf;
		shared_mem->gpa =
			(uintptr_t)ras_telemetry_gpu->uniras_shared_mem.blocks_ecc_buf -
					adev->gmc.vram_start;
		shared_mem->size = mem_size;
	} else {
		if (mem_size > AMD_SRIOV_UNIRAS_CMD_MAX_SIZE)
			return -ENOMEM;

		shared_mem->cpu_addr = ras_telemetry_cpu->uniras_shared_mem.cmd_buf;
		shared_mem->gpa =
			(uintptr_t)ras_telemetry_gpu->uniras_shared_mem.cmd_buf -
					adev->gmc.vram_start;
		shared_mem->size = mem_size;
	}

	return 0;
}

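/*
 * Issue one RAS command to the host: stage the command context in the
 * shared command buffer, pass its GPA to the host, then copy the response
 * back to the caller. remote_access_lock serializes users of the single
 * shared buffer.
 */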
static int amdgpu_virt_ras_remote_ioctl_cmd(struct ras_core_context *ras_core,
			struct ras_cmd_ctx *cmd, void *output_data, uint32_t output_size)
{
	struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(ras_core->dev);
	struct amdgpu_virt_ras_cmd *virt_ras = ras_mgr->virt_ras_cmd;
	uint32_t mem_len = ALIGN(sizeof(*cmd) + output_size, AMDGPU_GPU_PAGE_SIZE);
	struct ras_cmd_ctx *rcmd;
	struct amdgpu_virt_shared_mem shared_mem = {0};
	int ret = 0;

	mutex_lock(&virt_ras->remote_access_lock);

	ret = amdgpu_virt_ras_get_cmd_shared_mem(ras_core, cmd->cmd_id, mem_len, &shared_mem);
	if (ret)
		goto out;

	rcmd = (struct ras_cmd_ctx *)shared_mem.cpu_addr;
	memset(rcmd, 0, mem_len);
	memcpy(rcmd, cmd, sizeof(*cmd));

	ret = amdgpu_virt_send_remote_ras_cmd(ras_core->dev,
				shared_mem.gpa, mem_len);
	if (!ret) {
		if (rcmd->cmd_res) {
			ret = rcmd->cmd_res;
			goto out;
		}

		cmd->cmd_res = rcmd->cmd_res;
		cmd->output_size = rcmd->output_size;
		if (rcmd->output_size && (rcmd->output_size <= output_size) && output_data)
			memcpy(output_data, rcmd->output_buff_raw, rcmd->output_size);
	}

out:
	mutex_unlock(&virt_ras->remote_access_lock);
	return ret;
}

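/*
 * Helper for simple request/response commands: marshal the input into a
 * temporary ras_cmd_ctx, run it remotely and require the response size to
 * match exactly what the caller expects.
 */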
static int amdgpu_virt_ras_send_remote_cmd(struct ras_core_context *ras_core,
	uint32_t cmd_id, void *input_data, uint32_t input_size,
	void *output_data, uint32_t output_size)
{
	struct ras_cmd_ctx rcmd = {0};
	int ret;

	if (input_size > RAS_CMD_MAX_IN_SIZE)
		return RAS_CMD__ERROR_INVALID_INPUT_SIZE;

	rcmd.cmd_id = cmd_id;
	rcmd.input_size = input_size;
	memcpy(rcmd.input_buff_raw, input_data, input_size);

	ret = amdgpu_virt_ras_remote_ioctl_cmd(ras_core,
				&rcmd, output_data, output_size);
	if (!ret) {
		if (rcmd.output_size != output_size)
			return RAS_CMD__ERROR_GENERIC;
	}

	return ret;
}

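/* Query which batches of RAS trace logs the host currently holds. */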
static int amdgpu_virt_ras_get_batch_trace_overview(struct ras_core_context *ras_core,
	struct ras_log_batch_overview *overview)
{
	struct ras_cmd_batch_trace_snapshot_req req = {0};
	struct ras_cmd_batch_trace_snapshot_rsp rsp = {0};
	int ret;

	ret = amdgpu_virt_ras_send_remote_cmd(ras_core, RAS_CMD__GET_BATCH_TRACE_SNAPSHOT,
				&req, sizeof(req), &rsp, sizeof(rsp));
	if (ret)
		return ret;

	overview->first_batch_id = rsp.start_batch_id;
	overview->last_batch_id = rsp.latest_batch_id;
	overview->logged_batch_count = rsp.total_batch_num;

	return RAS_CMD__SUCCESS;
}

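/*
 * RAS_CMD__GET_CPER_SNAPSHOT handler: forward the request to the host,
 * then reset the local batch cache and refresh the batch overview so that
 * subsequent record queries operate on a consistent view.
 */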
static int amdgpu_virt_ras_get_cper_snapshot(struct ras_core_context *ras_core,
			struct ras_cmd_ctx *cmd, void *data)
{
	struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(ras_core->dev);
	struct amdgpu_virt_ras_cmd *virt_ras =
			(struct amdgpu_virt_ras_cmd *)ras_mgr->virt_ras_cmd;
	int ret;

	if (cmd->input_size != sizeof(struct ras_cmd_cper_snapshot_req))
		return RAS_CMD__ERROR_INVALID_INPUT_SIZE;

	ret = amdgpu_virt_ras_send_remote_cmd(ras_core, cmd->cmd_id,
			cmd->input_buff_raw, cmd->input_size,
			cmd->output_buff_raw, sizeof(struct ras_cmd_cper_snapshot_rsp));
	if (ret)
		return ret;

	memset(&virt_ras->batch_mgr, 0, sizeof(virt_ras->batch_mgr));
	amdgpu_virt_ras_get_batch_trace_overview(ras_core,
					&virt_ras->batch_mgr.batch_overview);

	cmd->output_size = sizeof(struct ras_cmd_cper_snapshot_rsp);
	return RAS_CMD__SUCCESS;
}

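/*
 * Fetch pointers to the trace records of one batch. Batches are pulled
 * from the host RAS_CMD_MAX_BATCH_NUM at a time and kept in rsp_cache, so
 * a remote round trip is only needed when batch_id falls outside the
 * cached window. Returns the number of entries placed in trace_arr, or a
 * negative errno.
 */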
static int amdgpu_virt_ras_get_batch_records(struct ras_core_context *ras_core, uint64_t batch_id,
			struct ras_log_info **trace_arr, uint32_t arr_num,
			struct ras_cmd_batch_trace_record_rsp *rsp_cache)
{
	struct ras_cmd_batch_trace_record_req req = {
		.start_batch_id = batch_id,
		.batch_num = RAS_CMD_MAX_BATCH_NUM,
	};
	struct ras_cmd_batch_trace_record_rsp *rsp = rsp_cache;
	struct batch_ras_trace_info *batch;
	int ret = 0;
	uint32_t i;

	if (!rsp->real_batch_num || (batch_id < rsp->start_batch_id) ||
		(batch_id >= (rsp->start_batch_id + rsp->real_batch_num))) {

		memset(rsp, 0, sizeof(*rsp));
		ret = amdgpu_virt_ras_send_remote_cmd(ras_core, RAS_CMD__GET_BATCH_TRACE_RECORD,
			&req, sizeof(req), rsp, sizeof(*rsp));
		if (ret)
			return -EPIPE;
	}

	batch = &rsp->batchs[batch_id - rsp->start_batch_id];
	if (batch_id != batch->batch_id)
		return -ENODATA;

	for (i = 0; i < batch->trace_num; i++) {
		if (i >= arr_num)
			break;
		trace_arr[i] = &rsp->records[batch->offset + i];
	}

	return i;
}

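/*
 * RAS_CMD__GET_CPER_RECORD handler: walk the requested batch range,
 * convert each batch's traces into CPER records and copy the packed
 * result to the user buffer described by the request.
 */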
static int amdgpu_virt_ras_get_cper_records(struct ras_core_context *ras_core,
	struct ras_cmd_ctx *cmd, void *data)
{
	struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(ras_core->dev);
	struct amdgpu_virt_ras_cmd *virt_ras =
			(struct amdgpu_virt_ras_cmd *)ras_mgr->virt_ras_cmd;
	struct ras_cmd_cper_record_req *req =
		(struct ras_cmd_cper_record_req *)cmd->input_buff_raw;
	struct ras_cmd_cper_record_rsp *rsp =
		(struct ras_cmd_cper_record_rsp *)cmd->output_buff_raw;
	struct ras_log_batch_overview *overview = &virt_ras->batch_mgr.batch_overview;
	struct ras_cmd_batch_trace_record_rsp *rsp_cache = &virt_ras->batch_mgr.batch_trace;
	struct ras_log_info **trace;
	uint32_t offset = 0, real_data_len = 0;
	uint64_t batch_id;
	uint8_t *out_buf;
	int ret = 0, i, count;

	if (cmd->input_size != sizeof(struct ras_cmd_cper_record_req))
		return RAS_CMD__ERROR_INVALID_INPUT_SIZE;

	if (!req->buf_size || !req->buf_ptr || !req->cper_num)
		return RAS_CMD__ERROR_INVALID_INPUT_DATA;

	trace = kzalloc_objs(*trace, MAX_RECORD_PER_BATCH);
	if (!trace)
		return RAS_CMD__ERROR_GENERIC;

	out_buf = kzalloc(req->buf_size, GFP_KERNEL);
	if (!out_buf) {
		kfree(trace);
		return RAS_CMD__ERROR_GENERIC;
	}

	for (i = 0; i < req->cper_num; i++) {
		batch_id = req->cper_start_id + i;
		if (batch_id >= overview->last_batch_id)
			break;
		count = amdgpu_virt_ras_get_batch_records(ras_core, batch_id,
							  trace, MAX_RECORD_PER_BATCH,
							  rsp_cache);
		if (count > 0) {
			ret = ras_cper_generate_cper(ras_core, trace, count,
					&out_buf[offset], req->buf_size - offset, &real_data_len);
			if (ret)
				break;

			offset += real_data_len;
		}
	}

	if ((ret && (ret != -ENOMEM)) ||
	    copy_to_user(u64_to_user_ptr(req->buf_ptr), out_buf, offset)) {
		kfree(out_buf);
		kfree(trace);
		return RAS_CMD__ERROR_GENERIC;
	}

	rsp->real_data_size = offset;
	rsp->real_cper_num = i;
	rsp->remain_num = (ret == -ENOMEM) ? (req->cper_num - i) : 0;
	rsp->version = 0;

	cmd->output_size = sizeof(struct ras_cmd_cper_record_rsp);

	kfree(out_buf);
	kfree(trace);

	return RAS_CMD__SUCCESS;
}

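/*
 * Prime the shared block-ECC buffer with a GET_ALL_BLOCK_ECC_STATUS
 * command context so the host can keep its response area up to date.
 */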
static int __fill_get_blocks_ecc_cmd(struct amdgpu_device *adev,
			struct vram_blocks_ecc *blks_ecc)
{
	struct ras_cmd_ctx *rcmd;

	if (!blks_ecc || !blks_ecc->shared_mem.cpu_addr)
		return -EINVAL;

	rcmd = (struct ras_cmd_ctx *)blks_ecc->shared_mem.cpu_addr;

	rcmd->cmd_id = RAS_CMD__GET_ALL_BLOCK_ECC_STATUS;
	rcmd->input_size = sizeof(struct ras_cmd_blocks_ecc_req);
	rcmd->output_buf_size = blks_ecc->shared_mem.size - sizeof(*rcmd);

	return 0;
}

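/*
 * Register (reg == true) or unregister a command with the host's
 * auto-update mechanism, pointing it at the shared buffer at gpa_addr so
 * results can be refreshed without an explicit request each time.
 */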
static int __set_cmd_auto_update(struct amdgpu_device *adev,
			enum ras_cmd_id cmd_id, uint64_t gpa_addr, uint32_t len, bool reg)
{
	struct ras_cmd_auto_update_req req = {0};
	struct ras_cmd_auto_update_rsp rsp = {0};
	int ret;

	req.mode = reg ? 1 : 0;
	req.cmd_id = cmd_id;
	req.addr = gpa_addr;
	req.len = len;
	ret = amdgpu_ras_mgr_handle_ras_cmd(adev, RAS_CMD__SET_CMD_AUTO_UPDATE,
		&req, sizeof(req), &rsp, sizeof(rsp));

	return ret;
}

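/*
 * RAS_CMD__GET_BLOCK_ECC_STATUS handler: on first use, register the shared
 * block-ECC buffer for host auto update; after that, per-block CE/UE/DE
 * counts are read straight from the buffer without a remote round trip.
 */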
static int amdgpu_virt_ras_get_block_ecc(struct ras_core_context *ras_core,
				struct ras_cmd_ctx *cmd, void *data)
{
	struct amdgpu_device *adev = ras_core->dev;
	struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
	struct amdgpu_virt_ras_cmd *virt_ras =
			(struct amdgpu_virt_ras_cmd *)ras_mgr->virt_ras_cmd;
	struct vram_blocks_ecc *blks_ecc = &virt_ras->blocks_ecc;
	struct ras_cmd_ctx *blks_ecc_cmd_ctx;
	struct ras_cmd_blocks_ecc_rsp *blks_ecc_rsp;
	struct ras_cmd_block_ecc_info_req *input_data =
			(struct ras_cmd_block_ecc_info_req *)cmd->input_buff_raw;
	struct ras_cmd_block_ecc_info_rsp *output_data =
			(struct ras_cmd_block_ecc_info_rsp *)cmd->output_buff_raw;
	int ret = 0;

	if (cmd->input_size != sizeof(struct ras_cmd_block_ecc_info_req))
		return RAS_CMD__ERROR_INVALID_INPUT_SIZE;

	if (input_data->block_id >= MAX_RAS_BLOCK_NUM)
		return RAS_CMD__ERROR_INVALID_INPUT_DATA;

	if (__fill_get_blocks_ecc_cmd(adev, blks_ecc))
		return RAS_CMD__ERROR_GENERIC;

	if (!virt_ras->blocks_ecc.auto_update_actived) {
		ret = __set_cmd_auto_update(adev, RAS_CMD__GET_ALL_BLOCK_ECC_STATUS,
				blks_ecc->shared_mem.gpa,
				blks_ecc->shared_mem.size, true);
		if (ret)
			return ret;

		blks_ecc->auto_update_actived = true;
	}

	blks_ecc_cmd_ctx = blks_ecc->shared_mem.cpu_addr;
	blks_ecc_rsp = (struct ras_cmd_blocks_ecc_rsp *)blks_ecc_cmd_ctx->output_buff_raw;

	output_data->ce_count = blks_ecc_rsp->blocks[input_data->block_id].ce_count;
	output_data->ue_count = blks_ecc_rsp->blocks[input_data->block_id].ue_count;
	output_data->de_count = blks_ecc_rsp->blocks[input_data->block_id].de_count;

	cmd->output_size = sizeof(struct ras_cmd_block_ecc_info_rsp);
	return RAS_CMD__SUCCESS;
}

int amdgpu_virt_ras_check_address_validity(struct amdgpu_device *adev,
			uint64_t address, bool *hit)
{
	struct ras_cmd_address_check_req req = {0};
	struct ras_cmd_address_check_rsp rsp = {0};
	int ret = 0;

	req.address = address;

	ret = amdgpu_ras_mgr_handle_ras_cmd(adev, RAS_CMD__CHECK_ADDRESS_VALIDITY,
		&req, sizeof(req), &rsp, sizeof(rsp));

	if (ret)
		return RAS_CMD__ERROR_GENERIC;

	*hit = rsp.result ? true : false;

	return RAS_CMD__SUCCESS;
}

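/*
 * Translate a faulting address into the retired page frame numbers
 * reported by the host. Returns the number of PFNs written to @pfn
 * (capped at max_pfn_sz) or a negative errno.
 */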
int amdgpu_virt_ras_convert_retired_address(struct amdgpu_device *adev,
			uint64_t address, uint64_t *pfn, uint32_t max_pfn_sz)
{
	struct ras_cmd_convert_retired_address_req req = {0};
	struct ras_cmd_convert_retired_address_rsp rsp = {0};
	int ret = 0, i;
	int retired_page_count;

	if (!pfn || !max_pfn_sz)
		return -EINVAL;

	req.address = address;

	ret = amdgpu_ras_mgr_handle_ras_cmd(adev, RAS_CMD__CONVERT_RETIRED_ADDRESS,
		&req, sizeof(req), &rsp, sizeof(rsp));

	if (ret || rsp.retired_count == 0)
		return -EINVAL;

	retired_page_count = rsp.retired_count > max_pfn_sz ? max_pfn_sz : rsp.retired_count;

	for (i = 0; i < retired_page_count; i++)
		pfn[i] = rsp.retired_addr[i] >> AMDGPU_GPU_PAGE_SHIFT;

	return retired_page_count;
}

static struct ras_cmd_func_map amdgpu_virt_ras_cmd_maps[] = {
	{RAS_CMD__GET_CPER_SNAPSHOT, amdgpu_virt_ras_get_cper_snapshot},
	{RAS_CMD__GET_CPER_RECORD, amdgpu_virt_ras_get_cper_records},
	{RAS_CMD__GET_BLOCK_ECC_STATUS, amdgpu_virt_ras_get_block_ecc},
};

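/*
 * Dispatch a virtualization RAS command: commands with a local handler in
 * amdgpu_virt_ras_cmd_maps are handled there, everything else is
 * forwarded to the host unchanged.
 */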
int amdgpu_virt_ras_handle_cmd(struct ras_core_context *ras_core,
		struct ras_cmd_ctx *cmd)
{
	struct ras_cmd_func_map *ras_cmd = NULL;
	int i, res;

	for (i = 0; i < ARRAY_SIZE(amdgpu_virt_ras_cmd_maps); i++) {
		if (cmd->cmd_id == amdgpu_virt_ras_cmd_maps[i].cmd_id) {
			ras_cmd = &amdgpu_virt_ras_cmd_maps[i];
			break;
		}
	}

	if (ras_cmd)
		res = ras_cmd->func(ras_core, cmd, NULL);
	else
		res = amdgpu_virt_ras_remote_ioctl_cmd(ras_core, cmd,
					cmd->output_buff_raw, cmd->output_buf_size);

	cmd->cmd_res = res;

	if (cmd->output_size > cmd->output_buf_size) {
		RAS_DEV_ERR(ras_core->dev,
			"Output data size 0x%x exceeds buffer size 0x%x!\n",
			cmd->output_size, cmd->output_buf_size);
		return RAS_CMD__SUCCESS_EXEED_BUFFER;
	}

	return RAS_CMD__SUCCESS;
}

int amdgpu_virt_ras_sw_init(struct amdgpu_device *adev)
{
	struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
	struct amdgpu_virt_ras_cmd *virt_ras_cmd;

	ras_mgr->virt_ras_cmd = kzalloc_obj(struct amdgpu_virt_ras_cmd);
	if (!ras_mgr->virt_ras_cmd)
		return -ENOMEM;

	virt_ras_cmd = ras_mgr->virt_ras_cmd;
	mutex_init(&virt_ras_cmd->remote_access_lock);

	return 0;
}

int amdgpu_virt_ras_sw_fini(struct amdgpu_device *adev)
{
	struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
	struct amdgpu_virt_ras_cmd *virt_ras_cmd = ras_mgr->virt_ras_cmd;

	mutex_destroy(&virt_ras_cmd->remote_access_lock);
	kfree(ras_mgr->virt_ras_cmd);
	ras_mgr->virt_ras_cmd = NULL;

	return 0;
}

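/*
 * Refresh the RAS capability from the host and look up the block-ECC
 * shared-memory window; auto-update registration itself is deferred to
 * the first block-ECC query.
 */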
int amdgpu_virt_ras_hw_init(struct amdgpu_device *adev)
{
	struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
	struct amdgpu_virt_ras_cmd *virt_ras =
			(struct amdgpu_virt_ras_cmd *)ras_mgr->virt_ras_cmd;
	struct vram_blocks_ecc *blks_ecc = &virt_ras->blocks_ecc;

	amdgpu_virt_get_ras_capability(adev);

	memset(blks_ecc, 0, sizeof(*blks_ecc));
	if (amdgpu_virt_ras_get_cmd_shared_mem(ras_mgr->ras_core,
			RAS_CMD__GET_ALL_BLOCK_ECC_STATUS,
			AMD_SRIOV_UNIRAS_BLOCKS_BUF_SIZE, &blks_ecc->shared_mem))
		return -ENOMEM;

	return 0;
}

int amdgpu_virt_ras_hw_fini(struct amdgpu_device *adev)
{
	struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
	struct amdgpu_virt_ras_cmd *virt_ras =
			(struct amdgpu_virt_ras_cmd *)ras_mgr->virt_ras_cmd;
	struct vram_blocks_ecc *blks_ecc = &virt_ras->blocks_ecc;

	if (blks_ecc->shared_mem.cpu_addr)
		memset(blks_ecc->shared_mem.cpu_addr, 0, blks_ecc->shared_mem.size);

	memset(blks_ecc, 0, sizeof(*blks_ecc));

	return 0;
}

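/*
 * Invalidate the auto-update registration before a reset so the next
 * block-ECC query re-registers the shared buffer with the host.
 */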
int amdgpu_virt_ras_pre_reset(struct amdgpu_device *adev)
{
	struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
	struct amdgpu_virt_ras_cmd *virt_ras =
		(struct amdgpu_virt_ras_cmd *)ras_mgr->virt_ras_cmd;

	virt_ras->blocks_ecc.auto_update_actived = false;
	return 0;
}

int amdgpu_virt_ras_post_reset(struct amdgpu_device *adev)
{
	return 0;
}

void amdgpu_virt_ras_set_remote_uniras(struct amdgpu_device *adev, bool en)
{
	struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
	struct amdgpu_virt_ras_cmd *virt_ras;

	if (!ras_mgr || !ras_mgr->virt_ras_cmd)
		return;

	virt_ras = (struct amdgpu_virt_ras_cmd *)ras_mgr->virt_ras_cmd;
	virt_ras->remote_uniras_supported = en;
}

bool amdgpu_virt_ras_remote_uniras_enabled(struct amdgpu_device *adev)
{
	struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
	struct amdgpu_virt_ras_cmd *virt_ras;

	if (amdgpu_in_reset(adev))
		return false;

	if (!ras_mgr || !ras_mgr->virt_ras_cmd)
		return false;

	virt_ras = (struct amdgpu_virt_ras_cmd *)ras_mgr->virt_ras_cmd;

	return virt_ras->remote_uniras_supported;
}
570