xref: /linux/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_mgr.c (revision bf4afc53b77aeaa48b5409da5c8da6bb4eff7f43)
1 // SPDX-License-Identifier: MIT
2 /*
3  * Copyright 2025 Advanced Micro Devices, Inc.
4  *
5  * Permission is hereby granted, free of charge, to any person obtaining a
6  * copy of this software and associated documentation files (the "Software"),
7  * to deal in the Software without restriction, including without limitation
8  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9  * and/or sell copies of the Software, and to permit persons to whom the
10  * Software is furnished to do so, subject to the following conditions:
11  *
12  * The above copyright notice and this permission notice shall be included in
13  * all copies or substantial portions of the Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
19  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
20  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
21  * OTHER DEALINGS IN THE SOFTWARE.
22  *
23  */
24 #include "amdgpu.h"
25 #include "amdgpu_reset.h"
26 #include "amdgpu_xgmi.h"
27 #include "ras_sys.h"
28 #include "amdgpu_ras_mgr.h"
29 #include "amdgpu_ras_cmd.h"
30 #include "amdgpu_virt_ras_cmd.h"
31 #include "amdgpu_ras_process.h"
32 #include "amdgpu_ras_eeprom_i2c.h"
33 #include "amdgpu_ras_mp1_v13_0.h"
34 #include "amdgpu_ras_nbio_v7_9.h"
35 
36 #define MAX_SOCKET_NUM_PER_HIVE		8
37 #define MAX_AID_NUM_PER_SOCKET		4
38 #define MAX_XCD_NUM_PER_AID			2
39 
40 /* typical ECC bad page rate is 1 bad page per 100MB VRAM */
41 #define TYPICAL_ECC_BAD_PAGE_RATE (100ULL * SZ_1M)
42 
43 #define COUNT_BAD_PAGE_THRESHOLD(size) (((size) >> 21) << 4)
44 
/* Reserve 8 physical DRAM rows for possible retirement.
 * In the worst case, this loses 8 * 2MB of memory in the VRAM domain.
 */
48 #define RAS_RESERVED_VRAM_SIZE_DEFAULT	(16ULL << 20)
49 
50 
ras_mgr_init_event_mgr(struct ras_event_manager * mgr)51 static void ras_mgr_init_event_mgr(struct ras_event_manager *mgr)
52 {
53 	struct ras_event_state *event_state;
54 	int i;
55 
56 	memset(mgr, 0, sizeof(*mgr));
57 	atomic64_set(&mgr->seqno, 0);
58 
59 	for (i = 0; i < ARRAY_SIZE(mgr->event_state); i++) {
60 		event_state = &mgr->event_state[i];
61 		event_state->last_seqno = RAS_EVENT_INVALID_ID;
62 		atomic64_set(&event_state->count, 0);
63 	}
64 }
65 
amdgpu_ras_mgr_init_event_mgr(struct ras_core_context * ras_core)66 static void amdgpu_ras_mgr_init_event_mgr(struct ras_core_context *ras_core)
67 {
68 	struct amdgpu_device *adev = (struct amdgpu_device *)ras_core->dev;
69 	struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
70 	struct ras_event_manager *event_mgr;
71 	struct amdgpu_hive_info *hive;
72 
73 	hive = amdgpu_get_xgmi_hive(adev);
74 	event_mgr = hive ? &hive->event_mgr : &ras_mgr->ras_event_mgr;
75 
76 	/* init event manager with node 0 on xgmi system */
77 	if (!amdgpu_reset_in_recovery(adev)) {
78 		if (!hive || adev->gmc.xgmi.node_id == 0)
79 			ras_mgr_init_event_mgr(event_mgr);
80 	}
81 
82 	if (hive)
83 		amdgpu_put_xgmi_hive(hive);
84 }
85 
amdgpu_ras_mgr_init_aca_config(struct amdgpu_device * adev,struct ras_core_config * config)86 static int amdgpu_ras_mgr_init_aca_config(struct amdgpu_device *adev,
87 		struct ras_core_config *config)
88 {
89 	struct ras_aca_config *aca_cfg = &config->aca_cfg;
90 
91 	aca_cfg->socket_num_per_hive = MAX_SOCKET_NUM_PER_HIVE;
92 	aca_cfg->aid_num_per_socket = MAX_AID_NUM_PER_SOCKET;
93 	aca_cfg->xcd_num_per_aid = MAX_XCD_NUM_PER_AID;
94 
95 	return 0;
96 }
97 
/*
 * Fill in the EEPROM bad-page record configuration: the I2C system
 * callbacks, transfer-length limits taken from the adapter quirks (when
 * present), and the bad-page threshold derived from the
 * amdgpu_bad_page_threshold module parameter.
 */
static int amdgpu_ras_mgr_init_eeprom_config(struct amdgpu_device *adev,
		struct ras_core_config *config)
{
	struct ras_eeprom_config *eeprom_cfg = &config->eeprom_cfg;

	eeprom_cfg->eeprom_sys_fn = &amdgpu_ras_eeprom_i2c_sys_func;
	eeprom_cfg->eeprom_i2c_adapter = adev->pm.ras_eeprom_i2c_bus;
	if (eeprom_cfg->eeprom_i2c_adapter) {
		const struct i2c_adapter_quirks *quirks =
			((struct i2c_adapter *)eeprom_cfg->eeprom_i2c_adapter)->quirks;

		/* Honor the adapter's maximum read/write message lengths. */
		if (quirks) {
			eeprom_cfg->max_i2c_read_len = quirks->max_read_len;
			eeprom_cfg->max_i2c_write_len = quirks->max_write_len;
		}
	}

	/*
	 * amdgpu_bad_page_threshold is used to config
	 * the threshold for the number of bad pages.
	 * -1:  Threshold is set to default value
	 *      Driver will issue a warning message when threshold is reached
	 *      and continue runtime services.
	 * 0:   Disable bad page retirement
	 *      Driver will not retire bad pages
	 *      which is intended for debugging purpose.
	 * -2:  Threshold is determined by a formula
	 *      that assumes 1 bad page per 100M of local memory.
	 *      Driver will continue runtime services when threshold is reached.
	 * 0 < threshold < max number of bad page records in EEPROM,
	 *      A user-defined threshold is set
	 *      Driver will halt runtime services when this custom threshold is reached.
	 */
	if (amdgpu_bad_page_threshold == NONSTOP_OVER_THRESHOLD)
		eeprom_cfg->eeprom_record_threshold_count =
			div64_u64(adev->gmc.mc_vram_size, TYPICAL_ECC_BAD_PAGE_RATE);
	else if (amdgpu_bad_page_threshold == WARN_NONSTOP_OVER_THRESHOLD)
		eeprom_cfg->eeprom_record_threshold_count =
				COUNT_BAD_PAGE_THRESHOLD(RAS_RESERVED_VRAM_SIZE_DEFAULT);
	else
		eeprom_cfg->eeprom_record_threshold_count = amdgpu_bad_page_threshold;

	/* Keep the raw module-parameter value for later policy decisions. */
	eeprom_cfg->eeprom_record_threshold_config = amdgpu_bad_page_threshold;

	return 0;
}
144 
amdgpu_ras_mgr_init_mp1_config(struct amdgpu_device * adev,struct ras_core_config * config)145 static int amdgpu_ras_mgr_init_mp1_config(struct amdgpu_device *adev,
146 		struct ras_core_config *config)
147 {
148 	struct ras_mp1_config *mp1_cfg = &config->mp1_cfg;
149 	int ret = 0;
150 
151 	switch (config->mp1_ip_version) {
152 	case IP_VERSION(13, 0, 6):
153 	case IP_VERSION(13, 0, 14):
154 	case IP_VERSION(13, 0, 12):
155 		mp1_cfg->mp1_sys_fn = &amdgpu_ras_mp1_sys_func_v13_0;
156 		break;
157 	default:
158 		RAS_DEV_ERR(adev,
159 			"The mp1(0x%x) ras config is not right!\n",
160 			config->mp1_ip_version);
161 		ret = -EINVAL;
162 		break;
163 	}
164 
165 	return ret;
166 }
167 
amdgpu_ras_mgr_init_nbio_config(struct amdgpu_device * adev,struct ras_core_config * config)168 static int amdgpu_ras_mgr_init_nbio_config(struct amdgpu_device *adev,
169 		struct ras_core_config *config)
170 {
171 	struct ras_nbio_config *nbio_cfg = &config->nbio_cfg;
172 	int ret = 0;
173 
174 	switch (config->nbio_ip_version) {
175 	case IP_VERSION(7, 9, 0):
176 	case IP_VERSION(7, 9, 1):
177 		nbio_cfg->nbio_sys_fn = &amdgpu_ras_nbio_sys_func_v7_9;
178 		break;
179 	default:
180 		RAS_DEV_ERR(adev,
181 			"The nbio(0x%x) ras config is not right!\n",
182 			config->nbio_ip_version);
183 		ret = -EINVAL;
184 		break;
185 	}
186 
187 	return ret;
188 }
189 
amdgpu_ras_mgr_get_ras_psp_system_status(struct ras_core_context * ras_core,struct ras_psp_sys_status * status)190 static int amdgpu_ras_mgr_get_ras_psp_system_status(struct ras_core_context *ras_core,
191 			struct ras_psp_sys_status *status)
192 {
193 	struct amdgpu_device *adev = (struct amdgpu_device *)ras_core->dev;
194 	struct ta_context *context = &adev->psp.ras_context.context;
195 
196 	status->initialized = context->initialized;
197 	status->session_id = context->session_id;
198 	status->psp_cmd_mutex = &adev->psp.mutex;
199 
200 	return 0;
201 }
202 
amdgpu_ras_mgr_get_ras_ta_init_param(struct ras_core_context * ras_core,struct ras_ta_init_param * ras_ta_param)203 static int amdgpu_ras_mgr_get_ras_ta_init_param(struct ras_core_context *ras_core,
204 	struct ras_ta_init_param *ras_ta_param)
205 {
206 	struct amdgpu_device *adev = (struct amdgpu_device *)ras_core->dev;
207 	uint32_t nps_mode;
208 
209 	if (amdgpu_ras_is_poison_mode_supported(adev))
210 		ras_ta_param->poison_mode_en = 1;
211 
212 	if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu)
213 		ras_ta_param->dgpu_mode = 1;
214 
215 	ras_ta_param->xcc_mask = adev->gfx.xcc_mask;
216 	ras_ta_param->channel_dis_num = hweight32(adev->gmc.m_half_use) * 2;
217 
218 	ras_ta_param->active_umc_mask = adev->umc.active_mask;
219 	ras_ta_param->vram_type = (uint8_t)adev->gmc.vram_type;
220 
221 	if (!amdgpu_ras_mgr_get_curr_nps_mode(adev, &nps_mode))
222 		ras_ta_param->nps_mode = nps_mode;
223 
224 	return 0;
225 }
226 
/* PSP system hooks handed to the RAS core via the psp config. */
const struct ras_psp_sys_func amdgpu_ras_psp_sys_func = {
	.get_ras_psp_system_status = amdgpu_ras_mgr_get_ras_psp_system_status,
	.get_ras_ta_init_param = amdgpu_ras_mgr_get_ras_ta_init_param,
};
231 
amdgpu_ras_mgr_init_psp_config(struct amdgpu_device * adev,struct ras_core_config * config)232 static int amdgpu_ras_mgr_init_psp_config(struct amdgpu_device *adev,
233 	struct ras_core_config *config)
234 {
235 	struct ras_psp_config *psp_cfg = &config->psp_cfg;
236 
237 	psp_cfg->psp_sys_fn = &amdgpu_ras_psp_sys_func;
238 
239 	return 0;
240 }
241 
amdgpu_ras_mgr_init_umc_config(struct amdgpu_device * adev,struct ras_core_config * config)242 static int amdgpu_ras_mgr_init_umc_config(struct amdgpu_device *adev,
243 	struct ras_core_config *config)
244 {
245 	struct ras_umc_config *umc_cfg = &config->umc_cfg;
246 
247 	umc_cfg->umc_vram_type = adev->gmc.vram_type;
248 
249 	return 0;
250 }
251 
amdgpu_ras_mgr_create_ras_core(struct amdgpu_device * adev)252 static struct ras_core_context *amdgpu_ras_mgr_create_ras_core(struct amdgpu_device *adev)
253 {
254 	struct ras_core_config init_config;
255 
256 	memset(&init_config, 0, sizeof(init_config));
257 
258 	init_config.umc_ip_version = amdgpu_ip_version(adev, UMC_HWIP, 0);
259 	init_config.mp1_ip_version = amdgpu_ip_version(adev, MP1_HWIP, 0);
260 	init_config.gfx_ip_version = amdgpu_ip_version(adev, GC_HWIP, 0);
261 	init_config.nbio_ip_version = amdgpu_ip_version(adev, NBIO_HWIP, 0);
262 	init_config.psp_ip_version = amdgpu_ip_version(adev, MP1_HWIP, 0);
263 
264 	if (init_config.umc_ip_version == IP_VERSION(12, 0, 0) ||
265 	    init_config.umc_ip_version == IP_VERSION(12, 5, 0))
266 		init_config.aca_ip_version = IP_VERSION(1, 0, 0);
267 
268 	init_config.sys_fn = &amdgpu_ras_sys_fn;
269 	init_config.ras_eeprom_supported = true;
270 	init_config.poison_supported =
271 		amdgpu_ras_is_poison_mode_supported(adev);
272 
273 	amdgpu_ras_mgr_init_aca_config(adev, &init_config);
274 	amdgpu_ras_mgr_init_eeprom_config(adev, &init_config);
275 	amdgpu_ras_mgr_init_mp1_config(adev, &init_config);
276 	amdgpu_ras_mgr_init_nbio_config(adev, &init_config);
277 	amdgpu_ras_mgr_init_psp_config(adev, &init_config);
278 	amdgpu_ras_mgr_init_umc_config(adev, &init_config);
279 
280 	return ras_core_create(&init_config);
281 }
282 
amdgpu_ras_mgr_sw_init(struct amdgpu_ip_block * ip_block)283 static int amdgpu_ras_mgr_sw_init(struct amdgpu_ip_block *ip_block)
284 {
285 	struct amdgpu_device *adev = ip_block->adev;
286 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
287 	struct amdgpu_ras_mgr *ras_mgr;
288 	int ret = 0;
289 
290 	/* Disabled by default */
291 	con->uniras_enabled = false;
292 
293 	/* Enabled only in debug mode */
294 	if (adev->debug_enable_ras_aca) {
295 		con->uniras_enabled = true;
296 		RAS_DEV_INFO(adev, "Debug amdgpu uniras!");
297 	}
298 
299 	if (!con->uniras_enabled)
300 		return 0;
301 
302 	ras_mgr = kzalloc_obj(*ras_mgr);
303 	if (!ras_mgr)
304 		return -EINVAL;
305 
306 	con->ras_mgr = ras_mgr;
307 	ras_mgr->adev = adev;
308 
309 	ras_mgr->ras_core = amdgpu_ras_mgr_create_ras_core(adev);
310 	if (!ras_mgr->ras_core) {
311 		RAS_DEV_ERR(adev, "Failed to create ras core!\n");
312 		ret = -EINVAL;
313 		goto err;
314 	}
315 
316 	ras_mgr->ras_core->dev = adev;
317 
318 	amdgpu_ras_process_init(adev);
319 	ras_core_sw_init(ras_mgr->ras_core);
320 	amdgpu_ras_mgr_init_event_mgr(ras_mgr->ras_core);
321 
322 	if (amdgpu_sriov_vf(adev)) {
323 		ret = amdgpu_virt_ras_sw_init(adev);
324 		if (ret) {
325 			RAS_DEV_ERR(adev,
326 				"Virt ras sw_init failed! ret:%d\n", ret);
327 			goto err;
328 		}
329 	}
330 
331 	return 0;
332 
333 err:
334 	kfree(ras_mgr);
335 	return ret;
336 }
337 
amdgpu_ras_mgr_sw_fini(struct amdgpu_ip_block * ip_block)338 static int amdgpu_ras_mgr_sw_fini(struct amdgpu_ip_block *ip_block)
339 {
340 	struct amdgpu_device *adev = ip_block->adev;
341 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
342 	struct amdgpu_ras_mgr *ras_mgr = (struct amdgpu_ras_mgr *)con->ras_mgr;
343 
344 	if (!con->uniras_enabled)
345 		return 0;
346 
347 	if (!ras_mgr)
348 		return 0;
349 
350 	if (amdgpu_sriov_vf(adev))
351 		amdgpu_virt_ras_sw_fini(adev);
352 
353 	amdgpu_ras_process_fini(adev);
354 	ras_core_sw_fini(ras_mgr->ras_core);
355 	ras_core_destroy(ras_mgr->ras_core);
356 	ras_mgr->ras_core = NULL;
357 
358 	kfree(con->ras_mgr);
359 	con->ras_mgr = NULL;
360 
361 	return 0;
362 }
363 
amdgpu_ras_mgr_hw_init(struct amdgpu_ip_block * ip_block)364 static int amdgpu_ras_mgr_hw_init(struct amdgpu_ip_block *ip_block)
365 {
366 	struct amdgpu_device *adev = ip_block->adev;
367 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
368 	struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
369 	int ret;
370 
371 	if (!con->uniras_enabled)
372 		return 0;
373 
374 	if (!ras_mgr || !ras_mgr->ras_core)
375 		return -EINVAL;
376 
377 	if (amdgpu_sriov_vf(adev))
378 		ret = amdgpu_virt_ras_hw_init(adev);
379 	else
380 		ret = ras_core_hw_init(ras_mgr->ras_core);
381 
382 	if (ret) {
383 		RAS_DEV_ERR(adev, "Failed to initialize hw_init!, ret:%d\n", ret);
384 		return ret;
385 	}
386 
387 	ras_mgr->ras_is_ready = true;
388 
389 	amdgpu_enable_uniras(adev, true);
390 
391 	RAS_DEV_INFO(adev, "AMDGPU RAS Is Ready.\n");
392 	return 0;
393 }
394 
amdgpu_ras_mgr_hw_fini(struct amdgpu_ip_block * ip_block)395 static int amdgpu_ras_mgr_hw_fini(struct amdgpu_ip_block *ip_block)
396 {
397 	struct amdgpu_device *adev = ip_block->adev;
398 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
399 	struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
400 
401 	if (!con->uniras_enabled)
402 		return 0;
403 
404 	if (!ras_mgr || !ras_mgr->ras_core)
405 		return -EINVAL;
406 
407 	if (amdgpu_sriov_vf(adev))
408 		amdgpu_virt_ras_hw_fini(adev);
409 	else
410 		ras_core_hw_fini(ras_mgr->ras_core);
411 
412 	ras_mgr->ras_is_ready = false;
413 
414 	return 0;
415 }
416 
amdgpu_ras_mgr_get_context(struct amdgpu_device * adev)417 struct amdgpu_ras_mgr *amdgpu_ras_mgr_get_context(struct amdgpu_device *adev)
418 {
419 	if (!adev || !adev->psp.ras_context.ras)
420 		return NULL;
421 
422 	return (struct amdgpu_ras_mgr *)adev->psp.ras_context.ras->ras_mgr;
423 }
424 
/* IP block callbacks for the unified RAS v1.0 block. */
static const struct amd_ip_funcs __maybe_unused ras_v1_0_ip_funcs = {
	.name = "ras_v1_0",
	.sw_init = amdgpu_ras_mgr_sw_init,
	.sw_fini = amdgpu_ras_mgr_sw_fini,
	.hw_init = amdgpu_ras_mgr_hw_init,
	.hw_fini = amdgpu_ras_mgr_hw_fini,
};
432 
/* Version descriptor registered with the amdgpu IP block framework. */
const struct amdgpu_ip_block_version ras_v1_0_ip_block = {
	.type = AMD_IP_BLOCK_TYPE_RAS,
	.major = 1,
	.minor = 0,
	.rev = 0,
	.funcs = &ras_v1_0_ip_funcs,
};
440 
/*
 * Enable or disable the unified RAS core.
 *
 * Fix: the log message previously always said "Enable" even when the
 * function was called with enable == false; it now reflects the
 * requested state.
 */
int amdgpu_enable_uniras(struct amdgpu_device *adev, bool enable)
{
	struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);

	if (!ras_mgr || !ras_mgr->ras_core)
		return -EPERM;

	RAS_DEV_INFO(adev, "%s amdgpu unified ras!",
		     enable ? "Enable" : "Disable");
	return ras_core_set_status(ras_mgr->ras_core, enable);
}
451 
amdgpu_uniras_enabled(struct amdgpu_device * adev)452 bool amdgpu_uniras_enabled(struct amdgpu_device *adev)
453 {
454 	struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
455 
456 	if (amdgpu_sriov_vf(adev))
457 		return amdgpu_virt_ras_remote_uniras_enabled(adev);
458 
459 	if (!ras_mgr || !ras_mgr->ras_core)
460 		return false;
461 
462 	return ras_core_is_enabled(ras_mgr->ras_core);
463 }
464 
amdgpu_ras_mgr_is_ready(struct amdgpu_device * adev)465 static bool amdgpu_ras_mgr_is_ready(struct amdgpu_device *adev)
466 {
467 	struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
468 
469 	if (ras_mgr && ras_mgr->ras_core && ras_mgr->ras_is_ready &&
470 	    ras_core_is_ready(ras_mgr->ras_core))
471 		return true;
472 
473 	return false;
474 }
475 
amdgpu_ras_mgr_handle_fatal_interrupt(struct amdgpu_device * adev,void * data)476 int amdgpu_ras_mgr_handle_fatal_interrupt(struct amdgpu_device *adev, void *data)
477 {
478 	struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
479 
480 	if (!amdgpu_ras_mgr_is_ready(adev))
481 		return -EPERM;
482 
483 	return ras_core_handle_nbio_irq(ras_mgr->ras_core, data);
484 }
485 
amdgpu_ras_mgr_gen_ras_event_seqno(struct amdgpu_device * adev,enum ras_seqno_type seqno_type)486 uint64_t amdgpu_ras_mgr_gen_ras_event_seqno(struct amdgpu_device *adev,
487 			enum ras_seqno_type seqno_type)
488 {
489 	struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
490 	int ret;
491 	uint64_t seq_no;
492 
493 	if (!amdgpu_ras_mgr_is_ready(adev) ||
494 	    (seqno_type >= RAS_SEQNO_TYPE_COUNT_MAX))
495 		return 0;
496 
497 	seq_no = ras_core_gen_seqno(ras_mgr->ras_core, seqno_type);
498 
499 	if ((seqno_type == RAS_SEQNO_TYPE_DE) ||
500 	    (seqno_type == RAS_SEQNO_TYPE_POISON_CONSUMPTION)) {
501 		ret = ras_core_put_seqno(ras_mgr->ras_core, seqno_type, seq_no);
502 		if (ret)
503 			RAS_DEV_WARN(adev, "There are too many ras interrupts!");
504 	}
505 
506 	return seq_no;
507 }
508 
amdgpu_ras_mgr_handle_controller_interrupt(struct amdgpu_device * adev,void * data)509 int amdgpu_ras_mgr_handle_controller_interrupt(struct amdgpu_device *adev, void *data)
510 {
511 	struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
512 	struct ras_ih_info *ih_info = (struct ras_ih_info *)data;
513 	uint64_t seq_no = 0;
514 	int ret = 0;
515 
516 	if (!amdgpu_ras_mgr_is_ready(adev))
517 		return -EPERM;
518 
519 	if (ih_info && (ih_info->block == AMDGPU_RAS_BLOCK__UMC)) {
520 		if (ras_mgr->ras_core->poison_supported) {
521 			seq_no = amdgpu_ras_mgr_gen_ras_event_seqno(adev, RAS_SEQNO_TYPE_DE);
522 			RAS_DEV_INFO(adev,
523 				"{%llu} RAS poison is created, no user action is needed.\n",
524 				seq_no);
525 		}
526 
527 		ret = amdgpu_ras_process_handle_umc_interrupt(adev, ih_info);
528 	} else if (ras_mgr->ras_core->poison_supported) {
529 		ret = amdgpu_ras_process_handle_unexpected_interrupt(adev, ih_info);
530 	} else {
531 		RAS_DEV_WARN(adev,
532 			"No RAS interrupt handler for non-UMC block with poison disabled.\n");
533 	}
534 
535 	return ret;
536 }
537 
amdgpu_ras_mgr_handle_consumer_interrupt(struct amdgpu_device * adev,void * data)538 int amdgpu_ras_mgr_handle_consumer_interrupt(struct amdgpu_device *adev, void *data)
539 {
540 	if (!amdgpu_ras_mgr_is_ready(adev))
541 		return -EPERM;
542 
543 	return amdgpu_ras_process_handle_consumption_interrupt(adev, data);
544 }
545 
amdgpu_ras_mgr_update_ras_ecc(struct amdgpu_device * adev)546 int amdgpu_ras_mgr_update_ras_ecc(struct amdgpu_device *adev)
547 {
548 	struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
549 
550 	if (!amdgpu_ras_mgr_is_ready(adev))
551 		return -EPERM;
552 
553 	return ras_core_update_ecc_info(ras_mgr->ras_core);
554 }
555 
/* Record the requested reset flags and trigger a RAS GPU reset. */
int amdgpu_ras_mgr_reset_gpu(struct amdgpu_device *adev, uint32_t flags)
{
	struct amdgpu_ras *con;

	if (!amdgpu_ras_mgr_is_ready(adev))
		return -EPERM;

	con = amdgpu_ras_get_context(adev);
	con->gpu_reset_flags |= flags;
	return amdgpu_ras_reset_gpu(adev);
}
566 
amdgpu_ras_mgr_check_eeprom_safety_watermark(struct amdgpu_device * adev)567 bool amdgpu_ras_mgr_check_eeprom_safety_watermark(struct amdgpu_device *adev)
568 {
569 	struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
570 
571 	if (!amdgpu_ras_mgr_is_ready(adev))
572 		return false;
573 
574 	return ras_eeprom_check_safety_watermark(ras_mgr->ras_core);
575 }
576 
/* Fetch the current NPS partition mode from the RAS core. Only modes
 * in (0, AMDGPU_NPS8_PARTITION_MODE] are considered valid; anything
 * else yields -EINVAL and *nps_mode is left untouched.
 */
int amdgpu_ras_mgr_get_curr_nps_mode(struct amdgpu_device *adev,
	uint32_t *nps_mode)
{
	struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
	uint32_t curr;

	if (!amdgpu_ras_mgr_is_ready(adev))
		return -EINVAL;

	curr = ras_core_get_curr_nps_mode(ras_mgr->ras_core);
	if (curr == 0 || curr > AMDGPU_NPS8_PARTITION_MODE)
		return -EINVAL;

	*nps_mode = curr;
	return 0;
}
594 
amdgpu_ras_mgr_check_retired_addr(struct amdgpu_device * adev,uint64_t addr)595 bool amdgpu_ras_mgr_check_retired_addr(struct amdgpu_device *adev,
596 			uint64_t addr)
597 {
598 	struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
599 
600 	if (!amdgpu_ras_mgr_is_ready(adev))
601 		return false;
602 
603 	return ras_umc_check_retired_addr(ras_mgr->ras_core, addr);
604 }
605 
amdgpu_ras_mgr_is_rma(struct amdgpu_device * adev)606 bool amdgpu_ras_mgr_is_rma(struct amdgpu_device *adev)
607 {
608 	struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
609 
610 	if (!ras_mgr || !ras_mgr->ras_core || !ras_mgr->ras_is_ready)
611 		return false;
612 
613 	return ras_core_gpu_is_rma(ras_mgr->ras_core);
614 }
615 
/*
 * Execute a RAS command: copy the input payload into a freshly
 * allocated command context, submit it, and copy the result back when
 * sizes match and the command succeeded.
 *
 * Fixes: validate input_size against the space available after the
 * context header before the memcpy (previously an oversized
 * input_size could overflow the PAGE_SIZE allocation), reject a NULL
 * input with a non-zero size, and verify ras_mgr/ras_core exist —
 * on SR-IOV the readiness check is skipped, so ras_mgr could be NULL
 * and was dereferenced unconditionally.
 */
int amdgpu_ras_mgr_handle_ras_cmd(struct amdgpu_device *adev,
			uint32_t cmd_id, void *input, uint32_t input_size,
			void *output, uint32_t out_size)
{
	struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
	struct ras_cmd_ctx *cmd_ctx;
	uint32_t ctx_buf_size = PAGE_SIZE;
	int ret;

	if (!amdgpu_sriov_vf(adev) && !amdgpu_ras_mgr_is_ready(adev))
		return -EPERM;

	if (!ras_mgr || !ras_mgr->ras_core)
		return -EPERM;

	/* The payload must fit in the buffer tail after the header.
	 * NOTE(review): assumes input_buff_raw occupies the tail of the
	 * ctx_buf_size allocation, like output_buf_size below — confirm
	 * against struct ras_cmd_ctx.
	 */
	if (input_size > ctx_buf_size - sizeof(*cmd_ctx) ||
	    (input_size && !input))
		return -EINVAL;

	cmd_ctx = kzalloc(ctx_buf_size, GFP_KERNEL);
	if (!cmd_ctx)
		return -ENOMEM;

	cmd_ctx->cmd_id = cmd_id;

	memcpy(cmd_ctx->input_buff_raw, input, input_size);
	cmd_ctx->input_size = input_size;
	cmd_ctx->output_buf_size = ctx_buf_size - sizeof(*cmd_ctx);

	ret = amdgpu_ras_submit_cmd(ras_mgr->ras_core, cmd_ctx);
	if (!ret && !cmd_ctx->cmd_res && output && (out_size == cmd_ctx->output_size))
		memcpy(output, cmd_ctx->output_buff_raw, cmd_ctx->output_size);

	kfree(cmd_ctx);

	return ret;
}
646 
amdgpu_ras_mgr_pre_reset(struct amdgpu_device * adev)647 int amdgpu_ras_mgr_pre_reset(struct amdgpu_device *adev)
648 {
649 	if (amdgpu_sriov_vf(adev))
650 		return amdgpu_virt_ras_pre_reset(adev);
651 
652 	if (!amdgpu_ras_mgr_is_ready(adev)) {
653 		RAS_DEV_ERR(adev, "Invalid ras suspend!\n");
654 		return -EPERM;
655 	}
656 
657 	amdgpu_ras_process_pre_reset(adev);
658 	return 0;
659 }
660 
amdgpu_ras_mgr_post_reset(struct amdgpu_device * adev)661 int amdgpu_ras_mgr_post_reset(struct amdgpu_device *adev)
662 {
663 	if (amdgpu_sriov_vf(adev))
664 		return amdgpu_virt_ras_post_reset(adev);
665 
666 	if (!amdgpu_ras_mgr_is_ready(adev)) {
667 		RAS_DEV_ERR(adev, "Invalid ras resume!\n");
668 		return -EPERM;
669 	}
670 
671 	amdgpu_ras_process_post_reset(adev);
672 	return 0;
673 }
674 
/* Translate a SoC physical address into the bad-page addresses of the
 * current NPS mode, writing at most max_page_count entries.
 */
int amdgpu_ras_mgr_lookup_bad_pages_in_a_row(struct amdgpu_device *adev,
		uint64_t addr, uint64_t *nps_page_addr, uint32_t max_page_count)
{
	struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);

	if (!amdgpu_ras_mgr_is_ready(adev))
		return -EPERM;

	if (!nps_page_addr || max_page_count == 0)
		return -EINVAL;

	return ras_core_convert_soc_pa_to_cur_nps_pages(ras_mgr->ras_core,
			addr, nps_page_addr, max_page_count);
}
689