xref: /linux/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_mgr.c (revision 24f171c7e145f43b9f187578e89b0982ce87e54c)
// SPDX-License-Identifier: MIT
/*
 * Copyright 2025 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 */
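
/*
 * amdgpu_ras_mgr glues the common RAS core to amdgpu: it builds a
 * ras_core_config from the detected IP versions, wires up the EEPROM,
 * MP1, NBIO, PSP and UMC callbacks, and exposes the result as the
 * ras_v1_0 software IP block.
 */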
#include "amdgpu.h"
#include "amdgpu_reset.h"
#include "amdgpu_xgmi.h"
#include "ras_sys.h"
#include "amdgpu_ras_mgr.h"
#include "amdgpu_ras_cmd.h"
#include "amdgpu_ras_process.h"
#include "amdgpu_ras_eeprom_i2c.h"
#include "amdgpu_ras_mp1_v13_0.h"
#include "amdgpu_ras_nbio_v7_9.h"

#define MAX_SOCKET_NUM_PER_HIVE		8
#define MAX_AID_NUM_PER_SOCKET		4
#define MAX_XCD_NUM_PER_AID			2

/* typical ECC bad page rate is 1 bad page per 100MB VRAM */
#define TYPICAL_ECC_BAD_PAGE_RATE (100ULL * SZ_1M)

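/*
 * (size) >> 21 is the number of 2MB pages in @size and << 4 budgets
 * 16 bad-page records per 2MB page, e.g.
 * COUNT_BAD_PAGE_THRESHOLD(16ULL << 20) = 8 << 4 = 128 records.
 */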
#define COUNT_BAD_PAGE_THRESHOLD(size) (((size) >> 21) << 4)

/* Reserve 8 physical DRAM rows for possible retirement.
 * In the worst case, this loses 8 * 2MB of memory in the VRAM domain.
 */
#define RAS_RESERVED_VRAM_SIZE_DEFAULT	(16ULL << 20)

static void ras_mgr_init_event_mgr(struct ras_event_manager *mgr)
{
	struct ras_event_state *event_state;
	int i;

	memset(mgr, 0, sizeof(*mgr));
	atomic64_set(&mgr->seqno, 0);

	for (i = 0; i < ARRAY_SIZE(mgr->event_state); i++) {
		event_state = &mgr->event_state[i];
		event_state->last_seqno = RAS_EVENT_INVALID_ID;
		atomic64_set(&event_state->count, 0);
	}
}

static void amdgpu_ras_mgr_init_event_mgr(struct ras_core_context *ras_core)
{
	struct amdgpu_device *adev = (struct amdgpu_device *)ras_core->dev;
	struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
	struct ras_event_manager *event_mgr;
	struct amdgpu_hive_info *hive;

	hive = amdgpu_get_xgmi_hive(adev);
	event_mgr = hive ? &hive->event_mgr : &ras_mgr->ras_event_mgr;

	/* on an xgmi system, only node 0 initializes the shared event manager */
	if (!amdgpu_reset_in_recovery(adev)) {
		if (!hive || adev->gmc.xgmi.node_id == 0)
			ras_mgr_init_event_mgr(event_mgr);
	}

	if (hive)
		amdgpu_put_xgmi_hive(hive);
}

static int amdgpu_ras_mgr_init_aca_config(struct amdgpu_device *adev,
		struct ras_core_config *config)
{
	struct ras_aca_config *aca_cfg = &config->aca_cfg;

	aca_cfg->socket_num_per_hive = MAX_SOCKET_NUM_PER_HIVE;
	aca_cfg->aid_num_per_socket = MAX_AID_NUM_PER_SOCKET;
	aca_cfg->xcd_num_per_aid = MAX_XCD_NUM_PER_AID;

	return 0;
}

static int amdgpu_ras_mgr_init_eeprom_config(struct amdgpu_device *adev,
		struct ras_core_config *config)
{
	struct ras_eeprom_config *eeprom_cfg = &config->eeprom_cfg;

	eeprom_cfg->eeprom_sys_fn = &amdgpu_ras_eeprom_i2c_sys_func;
	eeprom_cfg->eeprom_i2c_adapter = adev->pm.ras_eeprom_i2c_bus;
	if (eeprom_cfg->eeprom_i2c_adapter) {
		const struct i2c_adapter_quirks *quirks =
			((struct i2c_adapter *)eeprom_cfg->eeprom_i2c_adapter)->quirks;

		if (quirks) {
			eeprom_cfg->max_i2c_read_len = quirks->max_read_len;
			eeprom_cfg->max_i2c_write_len = quirks->max_write_len;
		}
	}

	/*
	 * amdgpu_bad_page_threshold configures the threshold for the
	 * number of bad pages.
	 * -1:  Threshold is set to the default value.
	 *      The driver issues a warning message when the threshold is
	 *      reached and continues runtime services.
	 * 0:   Bad page retirement is disabled.
	 *      The driver will not retire bad pages, which is intended
	 *      for debugging purposes.
	 * -2:  Threshold is determined by a formula that assumes 1 bad
	 *      page per 100MB of local memory.
	 *      The driver continues runtime services when the threshold
	 *      is reached.
	 * 0 < threshold < max number of bad page records in EEPROM:
	 *      A user-defined threshold is set.
	 *      The driver halts runtime services when this custom
	 *      threshold is reached.
	 */
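	/*
	 * Illustrative arithmetic: with 64GB of VRAM the per-100MB formula
	 * gives div64_u64(64ULL << 30, TYPICAL_ECC_BAD_PAGE_RATE) = 655
	 * records, while the default reserved-VRAM budget gives
	 * COUNT_BAD_PAGE_THRESHOLD(RAS_RESERVED_VRAM_SIZE_DEFAULT) =
	 * (16MB >> 21) << 4 = 128 records.
	 */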
	if (amdgpu_bad_page_threshold == NONSTOP_OVER_THRESHOLD)
		eeprom_cfg->eeprom_record_threshold_count =
			div64_u64(adev->gmc.mc_vram_size, TYPICAL_ECC_BAD_PAGE_RATE);
	else if (amdgpu_bad_page_threshold == WARN_NONSTOP_OVER_THRESHOLD)
		eeprom_cfg->eeprom_record_threshold_count =
				COUNT_BAD_PAGE_THRESHOLD(RAS_RESERVED_VRAM_SIZE_DEFAULT);
	else
		eeprom_cfg->eeprom_record_threshold_count = amdgpu_bad_page_threshold;

	eeprom_cfg->eeprom_record_threshold_config = amdgpu_bad_page_threshold;

	return 0;
}

static int amdgpu_ras_mgr_init_mp1_config(struct amdgpu_device *adev,
		struct ras_core_config *config)
{
	struct ras_mp1_config *mp1_cfg = &config->mp1_cfg;
	int ret = 0;

	switch (config->mp1_ip_version) {
	case IP_VERSION(13, 0, 6):
	case IP_VERSION(13, 0, 14):
	case IP_VERSION(13, 0, 12):
		mp1_cfg->mp1_sys_fn = &amdgpu_ras_mp1_sys_func_v13_0;
		break;
	default:
		RAS_DEV_ERR(adev,
			"Unsupported mp1 ip version (0x%x) for ras!\n",
			config->mp1_ip_version);
		ret = -EINVAL;
		break;
	}

	return ret;
}

static int amdgpu_ras_mgr_init_nbio_config(struct amdgpu_device *adev,
		struct ras_core_config *config)
{
	struct ras_nbio_config *nbio_cfg = &config->nbio_cfg;
	int ret = 0;

	switch (config->nbio_ip_version) {
	case IP_VERSION(7, 9, 0):
	case IP_VERSION(7, 9, 1):
		nbio_cfg->nbio_sys_fn = &amdgpu_ras_nbio_sys_func_v7_9;
		break;
	default:
		RAS_DEV_ERR(adev,
			"Unsupported nbio ip version (0x%x) for ras!\n",
			config->nbio_ip_version);
		ret = -EINVAL;
		break;
	}

	return ret;
}

static int amdgpu_ras_mgr_get_ras_psp_system_status(struct ras_core_context *ras_core,
			struct ras_psp_sys_status *status)
{
	struct amdgpu_device *adev = (struct amdgpu_device *)ras_core->dev;
	struct ta_context *context = &adev->psp.ras_context.context;

	status->initialized = context->initialized;
	status->session_id = context->session_id;
	status->psp_cmd_mutex = &adev->psp.mutex;

	return 0;
}

static int amdgpu_ras_mgr_get_ras_ta_init_param(struct ras_core_context *ras_core,
	struct ras_ta_init_param *ras_ta_param)
{
	struct amdgpu_device *adev = (struct amdgpu_device *)ras_core->dev;
	uint32_t nps_mode;

	if (amdgpu_ras_is_poison_mode_supported(adev))
		ras_ta_param->poison_mode_en = 1;

	if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu)
		ras_ta_param->dgpu_mode = 1;

	ras_ta_param->xcc_mask = adev->gfx.xcc_mask;
	ras_ta_param->channel_dis_num = hweight32(adev->gmc.m_half_use) * 2;

	ras_ta_param->active_umc_mask = adev->umc.active_mask;

	if (!amdgpu_ras_mgr_get_curr_nps_mode(adev, &nps_mode))
		ras_ta_param->nps_mode = nps_mode;

	return 0;
}

const struct ras_psp_sys_func amdgpu_ras_psp_sys_func = {
	.get_ras_psp_system_status = amdgpu_ras_mgr_get_ras_psp_system_status,
	.get_ras_ta_init_param = amdgpu_ras_mgr_get_ras_ta_init_param,
};

static int amdgpu_ras_mgr_init_psp_config(struct amdgpu_device *adev,
	struct ras_core_config *config)
{
	struct ras_psp_config *psp_cfg = &config->psp_cfg;

	psp_cfg->psp_sys_fn = &amdgpu_ras_psp_sys_func;

	return 0;
}

static int amdgpu_ras_mgr_init_umc_config(struct amdgpu_device *adev,
	struct ras_core_config *config)
{
	struct ras_umc_config *umc_cfg = &config->umc_cfg;

	umc_cfg->umc_vram_type = adev->gmc.vram_type;

	return 0;
}

static struct ras_core_context *amdgpu_ras_mgr_create_ras_core(struct amdgpu_device *adev)
{
	struct ras_core_config init_config;

	memset(&init_config, 0, sizeof(init_config));

	init_config.umc_ip_version = amdgpu_ip_version(adev, UMC_HWIP, 0);
	init_config.mp1_ip_version = amdgpu_ip_version(adev, MP1_HWIP, 0);
	init_config.gfx_ip_version = amdgpu_ip_version(adev, GC_HWIP, 0);
	init_config.nbio_ip_version = amdgpu_ip_version(adev, NBIO_HWIP, 0);
	init_config.psp_ip_version = amdgpu_ip_version(adev, MP1_HWIP, 0);

	if (init_config.umc_ip_version == IP_VERSION(12, 0, 0) ||
	    init_config.umc_ip_version == IP_VERSION(12, 5, 0))
		init_config.aca_ip_version = IP_VERSION(1, 0, 0);

	init_config.sys_fn = &amdgpu_ras_sys_fn;
	init_config.ras_eeprom_supported = true;
	init_config.poison_supported =
		amdgpu_ras_is_poison_mode_supported(adev);

	amdgpu_ras_mgr_init_aca_config(adev, &init_config);
	amdgpu_ras_mgr_init_eeprom_config(adev, &init_config);
	amdgpu_ras_mgr_init_mp1_config(adev, &init_config);
	amdgpu_ras_mgr_init_nbio_config(adev, &init_config);
	amdgpu_ras_mgr_init_psp_config(adev, &init_config);
	amdgpu_ras_mgr_init_umc_config(adev, &init_config);

	return ras_core_create(&init_config);
}

static int amdgpu_ras_mgr_sw_init(struct amdgpu_ip_block *ip_block)
{
	struct amdgpu_device *adev = ip_block->adev;
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct amdgpu_ras_mgr *ras_mgr;
	int ret = 0;

	/* Disabled by default */
	con->uniras_enabled = false;

	/* Enabled only in debug mode */
	if (adev->debug_enable_ras_aca) {
		con->uniras_enabled = true;
		RAS_DEV_INFO(adev, "Debug amdgpu uniras!");
	}

	if (!con->uniras_enabled)
		return 0;

	ras_mgr = kzalloc(sizeof(*ras_mgr), GFP_KERNEL);
	if (!ras_mgr)
		return -ENOMEM;

	con->ras_mgr = ras_mgr;
	ras_mgr->adev = adev;

	ras_mgr->ras_core = amdgpu_ras_mgr_create_ras_core(adev);
	if (!ras_mgr->ras_core) {
		RAS_DEV_ERR(adev, "Failed to create ras core!\n");
		ret = -EINVAL;
		goto err;
	}

	ras_mgr->ras_core->dev = adev;

	amdgpu_ras_process_init(adev);
	ras_core_sw_init(ras_mgr->ras_core);
	amdgpu_ras_mgr_init_event_mgr(ras_mgr->ras_core);
	return 0;

err:
	con->ras_mgr = NULL;
	kfree(ras_mgr);
	return ret;
}

static int amdgpu_ras_mgr_sw_fini(struct amdgpu_ip_block *ip_block)
{
	struct amdgpu_device *adev = ip_block->adev;
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct amdgpu_ras_mgr *ras_mgr = (struct amdgpu_ras_mgr *)con->ras_mgr;

	if (!con->uniras_enabled)
		return 0;

	if (!ras_mgr)
		return 0;

	amdgpu_ras_process_fini(adev);
	ras_core_sw_fini(ras_mgr->ras_core);
	ras_core_destroy(ras_mgr->ras_core);
	ras_mgr->ras_core = NULL;

	kfree(con->ras_mgr);
	con->ras_mgr = NULL;

	return 0;
}

static int amdgpu_ras_mgr_hw_init(struct amdgpu_ip_block *ip_block)
{
	struct amdgpu_device *adev = ip_block->adev;
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
	int ret;

	if (!con->uniras_enabled)
		return 0;

	if (!ras_mgr || !ras_mgr->ras_core)
		return -EINVAL;

	ret = ras_core_hw_init(ras_mgr->ras_core);
	if (ret) {
		RAS_DEV_ERR(adev, "Failed to initialize ras core!\n");
		return ret;
	}

	ras_mgr->ras_is_ready = true;

	amdgpu_enable_uniras(adev, true);

	RAS_DEV_INFO(adev, "AMDGPU RAS is ready.\n");
	return 0;
}

static int amdgpu_ras_mgr_hw_fini(struct amdgpu_ip_block *ip_block)
{
	struct amdgpu_device *adev = ip_block->adev;
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);

	if (!con->uniras_enabled)
		return 0;

	if (!ras_mgr || !ras_mgr->ras_core)
		return -EINVAL;

	ras_core_hw_fini(ras_mgr->ras_core);

	ras_mgr->ras_is_ready = false;

	return 0;
}

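/**
 * amdgpu_ras_mgr_get_context - look up the RAS manager bound to a device
 * @adev: amdgpu device
 *
 * Return: the amdgpu_ras_mgr stored in the device's RAS context, or NULL
 * if the device or its RAS context has not been set up.
 */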
struct amdgpu_ras_mgr *amdgpu_ras_mgr_get_context(struct amdgpu_device *adev)
{
	if (!adev || !adev->psp.ras_context.ras)
		return NULL;

	return (struct amdgpu_ras_mgr *)adev->psp.ras_context.ras->ras_mgr;
}

static const struct amd_ip_funcs __maybe_unused ras_v1_0_ip_funcs = {
	.name = "ras_v1_0",
	.sw_init = amdgpu_ras_mgr_sw_init,
	.sw_fini = amdgpu_ras_mgr_sw_fini,
	.hw_init = amdgpu_ras_mgr_hw_init,
	.hw_fini = amdgpu_ras_mgr_hw_fini,
};

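/*
 * IP block wrapper around the unified RAS core. A minimal sketch of how
 * a block version like this is consumed (the actual registration lives
 * in the IP discovery code, not in this file):
 *
 *	amdgpu_device_ip_block_add(adev, &ras_v1_0_ip_block);
 */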
const struct amdgpu_ip_block_version ras_v1_0_ip_block = {
	.type = AMD_IP_BLOCK_TYPE_RAS,
	.major = 1,
	.minor = 0,
	.rev = 0,
	.funcs = &ras_v1_0_ip_funcs,
};

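/**
 * amdgpu_enable_uniras - switch the unified RAS core on or off
 * @adev: amdgpu device
 * @enable: true to enable, false to disable
 *
 * Unified RAS is not available under SR-IOV.
 *
 * Return: 0 on success or a negative error code.
 */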
int amdgpu_enable_uniras(struct amdgpu_device *adev, bool enable)
{
	struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);

	if (!ras_mgr || !ras_mgr->ras_core)
		return -EPERM;

	if (amdgpu_sriov_vf(adev))
		return -EPERM;

	RAS_DEV_INFO(adev, "%s amdgpu unified ras!",
		     enable ? "Enable" : "Disable");
	return ras_core_set_status(ras_mgr->ras_core, enable);
}

bool amdgpu_uniras_enabled(struct amdgpu_device *adev)
{
	struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);

	if (!ras_mgr || !ras_mgr->ras_core)
		return false;

	if (amdgpu_sriov_vf(adev))
		return false;

	return ras_core_is_enabled(ras_mgr->ras_core);
}

static bool amdgpu_ras_mgr_is_ready(struct amdgpu_device *adev)
{
	struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);

	if (ras_mgr && ras_mgr->ras_core && ras_mgr->ras_is_ready &&
	    ras_core_is_ready(ras_mgr->ras_core))
		return true;

	return false;
}

int amdgpu_ras_mgr_handle_fatal_interrupt(struct amdgpu_device *adev, void *data)
{
	struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);

	if (!amdgpu_ras_mgr_is_ready(adev))
		return -EPERM;

	return ras_core_handle_nbio_irq(ras_mgr->ras_core, data);
}

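/**
 * amdgpu_ras_mgr_gen_ras_event_seqno - generate a sequence number for a RAS event
 * @adev: amdgpu device
 * @seqno_type: type of RAS event
 *
 * For deferred-error and poison-consumption events the fresh sequence
 * number is also queued to the RAS core; a warning is logged if the
 * queue is full.
 *
 * Return: the new sequence number, or 0 if the RAS manager is not ready
 * or @seqno_type is out of range.
 */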
uint64_t amdgpu_ras_mgr_gen_ras_event_seqno(struct amdgpu_device *adev,
			enum ras_seqno_type seqno_type)
{
	struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
	int ret;
	uint64_t seq_no;

	if (!amdgpu_ras_mgr_is_ready(adev) ||
	    (seqno_type >= RAS_SEQNO_TYPE_COUNT_MAX))
		return 0;

	seq_no = ras_core_gen_seqno(ras_mgr->ras_core, seqno_type);

	if ((seqno_type == RAS_SEQNO_TYPE_DE) ||
	    (seqno_type == RAS_SEQNO_TYPE_POISON_CONSUMPTION)) {
		ret = ras_core_put_seqno(ras_mgr->ras_core, seqno_type, seq_no);
		if (ret)
			RAS_DEV_WARN(adev, "There are too many ras interrupts!");
	}

	return seq_no;
}

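/**
 * amdgpu_ras_mgr_handle_controller_interrupt - dispatch a RAS controller interrupt
 * @adev: amdgpu device
 * @data: pointer to the ras_ih_info describing the interrupt
 *
 * UMC interrupts go to the UMC handler (logging a poison-creation event
 * first when poison mode is on); interrupts from other blocks are only
 * handled when poison mode is supported.
 *
 * Return: 0 on success or a negative error code.
 */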
int amdgpu_ras_mgr_handle_controller_interrupt(struct amdgpu_device *adev, void *data)
{
	struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
	struct ras_ih_info *ih_info = (struct ras_ih_info *)data;
	uint64_t seq_no = 0;
	int ret = 0;

	if (!amdgpu_ras_mgr_is_ready(adev))
		return -EPERM;

	if (ih_info && (ih_info->block == AMDGPU_RAS_BLOCK__UMC)) {
		if (ras_mgr->ras_core->poison_supported) {
			seq_no = amdgpu_ras_mgr_gen_ras_event_seqno(adev, RAS_SEQNO_TYPE_DE);
			RAS_DEV_INFO(adev,
				"{%llu} RAS poison is created, no user action is needed.\n",
				seq_no);
		}

		ret = amdgpu_ras_process_handle_umc_interrupt(adev, ih_info);
	} else if (ras_mgr->ras_core->poison_supported) {
		ret = amdgpu_ras_process_handle_unexpected_interrupt(adev, ih_info);
	} else {
		RAS_DEV_WARN(adev,
			"No RAS interrupt handler for non-UMC block with poison disabled.\n");
	}

	return ret;
}

int amdgpu_ras_mgr_handle_consumer_interrupt(struct amdgpu_device *adev, void *data)
{
	if (!amdgpu_ras_mgr_is_ready(adev))
		return -EPERM;

	return amdgpu_ras_process_handle_consumption_interrupt(adev, data);
}

int amdgpu_ras_mgr_update_ras_ecc(struct amdgpu_device *adev)
{
	struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);

	if (!amdgpu_ras_mgr_is_ready(adev))
		return -EPERM;

	return ras_core_update_ecc_info(ras_mgr->ras_core);
}

int amdgpu_ras_mgr_reset_gpu(struct amdgpu_device *adev, uint32_t flags)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

	if (!amdgpu_ras_mgr_is_ready(adev))
		return -EPERM;

	con->gpu_reset_flags |= flags;
	return amdgpu_ras_reset_gpu(adev);
}

bool amdgpu_ras_mgr_check_eeprom_safety_watermark(struct amdgpu_device *adev)
{
	struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);

	if (!amdgpu_ras_mgr_is_ready(adev))
		return false;

	return ras_eeprom_check_safety_watermark(ras_mgr->ras_core);
}

int amdgpu_ras_mgr_get_curr_nps_mode(struct amdgpu_device *adev,
	uint32_t *nps_mode)
{
	struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
	uint32_t mode;

	if (!amdgpu_ras_mgr_is_ready(adev))
		return -EINVAL;

	mode = ras_core_get_curr_nps_mode(ras_mgr->ras_core);
	if (!mode || mode > AMDGPU_NPS8_PARTITION_MODE)
		return -EINVAL;

	*nps_mode = mode;

	return 0;
}

bool amdgpu_ras_mgr_check_retired_addr(struct amdgpu_device *adev,
			uint64_t addr)
{
	struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);

	if (!amdgpu_ras_mgr_is_ready(adev))
		return false;

	return ras_umc_check_retired_addr(ras_mgr->ras_core, addr);
}

bool amdgpu_ras_mgr_is_rma(struct amdgpu_device *adev)
{
	struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);

	if (!ras_mgr || !ras_mgr->ras_core || !ras_mgr->ras_is_ready)
		return false;

	return ras_core_gpu_is_rma(ras_mgr->ras_core);
}

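/**
 * amdgpu_ras_mgr_handle_ras_cmd - marshal and submit a RAS command
 * @adev: amdgpu device
 * @cmd_id: command identifier
 * @input: input payload, copied into a page-sized command context
 * @input_size: size of @input in bytes; callers are assumed to keep this
 *	small enough to fit the context's input buffer
 * @output: buffer for the command's output, may be NULL
 * @out_size: size of @output; the result is copied back only when it
 *	matches the command's output size exactly
 *
 * Return: 0 on success, -EPERM if RAS is not ready, -ENOMEM on allocation
 * failure, or the error returned by command submission.
 */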
int amdgpu_ras_mgr_handle_ras_cmd(struct amdgpu_device *adev,
			uint32_t cmd_id, void *input, uint32_t input_size,
			void *output, uint32_t out_size)
{
	struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
	struct ras_cmd_ctx *cmd_ctx;
	uint32_t ctx_buf_size = PAGE_SIZE;
	int ret;

	if (!amdgpu_ras_mgr_is_ready(adev))
		return -EPERM;

	cmd_ctx = kzalloc(ctx_buf_size, GFP_KERNEL);
	if (!cmd_ctx)
		return -ENOMEM;

	cmd_ctx->cmd_id = cmd_id;

	memcpy(cmd_ctx->input_buff_raw, input, input_size);
	cmd_ctx->input_size = input_size;
	cmd_ctx->output_buf_size = ctx_buf_size - sizeof(*cmd_ctx);

	ret = amdgpu_ras_submit_cmd(ras_mgr->ras_core, cmd_ctx);
	if (!ret && !cmd_ctx->cmd_res && output && (out_size == cmd_ctx->output_size))
		memcpy(output, cmd_ctx->output_buff_raw, cmd_ctx->output_size);

	kfree(cmd_ctx);

	return ret;
}

int amdgpu_ras_mgr_pre_reset(struct amdgpu_device *adev)
{
	if (!amdgpu_ras_mgr_is_ready(adev)) {
		RAS_DEV_ERR(adev, "RAS is not ready for pre-reset handling!\n");
		return -EPERM;
	}

	amdgpu_ras_process_pre_reset(adev);
	return 0;
}

int amdgpu_ras_mgr_post_reset(struct amdgpu_device *adev)
{
	if (!amdgpu_ras_mgr_is_ready(adev)) {
		RAS_DEV_ERR(adev, "RAS is not ready for post-reset handling!\n");
		return -EPERM;
	}

	amdgpu_ras_process_post_reset(adev);
	return 0;
}
649