xref: /linux/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c (revision 8b9eac5e0faebaffc5411505e0df0d00dc09504c)
1 /*
2  * Copyright 2019 Advanced Micro Devices, Inc.
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice shall be included in
12  * all copies or substantial portions of the Software.
13  *
14  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
17  * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
18  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
19  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
20  * OTHER DEALINGS IN THE SOFTWARE.
21  *
22  */
23 
24 #include <linux/sort.h>
25 #include "amdgpu.h"
26 #include "umc_v6_7.h"
27 #define MAX_UMC_POISON_POLLING_TIME_SYNC   20  //ms
28 
29 #define MAX_UMC_HASH_STRING_SIZE  256
30 
31 static int amdgpu_umc_convert_error_address(struct amdgpu_device *adev,
32 				    struct ras_err_data *err_data, uint64_t err_addr,
33 				    uint32_t ch_inst, uint32_t umc_inst)
34 {
35 	switch (amdgpu_ip_version(adev, UMC_HWIP, 0)) {
36 	case IP_VERSION(6, 7, 0):
37 		umc_v6_7_convert_error_address(adev,
38 				err_data, err_addr, ch_inst, umc_inst);
39 		break;
40 	default:
41 		dev_warn(adev->dev,
42 			 "UMC address to Physical address translation is not supported\n");
43 		return AMDGPU_RAS_FAIL;
44 	}
45 
46 	return AMDGPU_RAS_SUCCESS;
47 }
48 
49 int amdgpu_umc_page_retirement_mca(struct amdgpu_device *adev,
50 			uint64_t err_addr, uint32_t ch_inst, uint32_t umc_inst)
51 {
52 	struct ras_err_data err_data;
53 	int ret;
54 
55 	ret = amdgpu_ras_error_data_init(&err_data);
56 	if (ret)
57 		return ret;
58 
59 	err_data.err_addr =
60 		kcalloc(adev->umc.max_ras_err_cnt_per_query,
61 			sizeof(struct eeprom_table_record), GFP_KERNEL);
62 	if (!err_data.err_addr) {
63 		dev_warn(adev->dev,
64 			"Failed to alloc memory for umc error record in MCA notifier!\n");
65 		ret = AMDGPU_RAS_FAIL;
66 		goto out_fini_err_data;
67 	}
68 
69 	err_data.err_addr_len = adev->umc.max_ras_err_cnt_per_query;
70 
71 	/*
72 	 * Translate UMC channel address to Physical address
73 	 */
74 	ret = amdgpu_umc_convert_error_address(adev, &err_data, err_addr,
75 					ch_inst, umc_inst);
76 	if (ret)
77 		goto out_free_err_addr;
78 
79 	if (amdgpu_bad_page_threshold != 0) {
80 		amdgpu_ras_add_bad_pages(adev, err_data.err_addr,
81 						err_data.err_addr_cnt, false);
82 		amdgpu_ras_save_bad_pages(adev, NULL);
83 	}
84 
85 out_free_err_addr:
86 	kfree(err_data.err_addr);
87 
88 out_fini_err_data:
89 	amdgpu_ras_error_data_fini(&err_data);
90 
91 	return ret;
92 }
93 
94 void amdgpu_umc_handle_bad_pages(struct amdgpu_device *adev,
95 			void *ras_error_status)
96 {
97 	struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
98 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
99 	struct amdgpu_ras_eeprom_control *control = &con->eeprom_control;
100 	unsigned int error_query_mode;
101 	int ret = 0;
102 	unsigned long err_count;
103 
104 	amdgpu_ras_get_error_query_mode(adev, &error_query_mode);
105 
106 	err_data->err_addr =
107 		kcalloc(adev->umc.max_ras_err_cnt_per_query,
108 			sizeof(struct eeprom_table_record), GFP_KERNEL);
109 
110 	/* still call query_ras_error_address to clear error status
111 	 * even NOMEM error is encountered
112 	 */
113 	if (!err_data->err_addr)
114 		dev_warn(adev->dev,
115 			"Failed to alloc memory for umc error address record!\n");
116 	else
117 		err_data->err_addr_len = adev->umc.max_ras_err_cnt_per_query;
118 
119 	mutex_lock(&con->page_retirement_lock);
120 	if (!amdgpu_ras_smu_eeprom_supported(adev)) {
121 		ret = amdgpu_dpm_get_ecc_info(adev, (void *)&(con->umc_ecc));
122 		if (ret == -EOPNOTSUPP &&
123 		    error_query_mode == AMDGPU_RAS_DIRECT_ERROR_QUERY) {
124 			if (adev->umc.ras && adev->umc.ras->ras_block.hw_ops &&
125 			    adev->umc.ras->ras_block.hw_ops->query_ras_error_count)
126 				adev->umc.ras->ras_block.hw_ops->query_ras_error_count(adev,
127 								ras_error_status);
128 
129 			if (adev->umc.ras && adev->umc.ras->ras_block.hw_ops &&
130 			    adev->umc.ras->ras_block.hw_ops->query_ras_error_address &&
131 			    adev->umc.max_ras_err_cnt_per_query) {
132 				err_data->err_addr =
133 					kcalloc(adev->umc.max_ras_err_cnt_per_query,
134 						sizeof(struct eeprom_table_record), GFP_KERNEL);
135 
136 				/* still call query_ras_error_address to clear error status
137 				 * even NOMEM error is encountered
138 				 */
139 				if (!err_data->err_addr)
140 					dev_warn(adev->dev,
141 						"Failed to alloc memory for umc error address record!\n");
142 				else
143 					err_data->err_addr_len =
144 						adev->umc.max_ras_err_cnt_per_query;
145 
146 				/* umc query_ras_error_address is also responsible for clearing
147 				 * error status
148 				 */
149 				adev->umc.ras->ras_block.hw_ops->query_ras_error_address(adev,
150 								ras_error_status);
151 			}
152 		} else if (error_query_mode == AMDGPU_RAS_FIRMWARE_ERROR_QUERY ||
153 		    (!ret && error_query_mode == AMDGPU_RAS_DIRECT_ERROR_QUERY)) {
154 			if (adev->umc.ras &&
155 			    adev->umc.ras->ecc_info_query_ras_error_count)
156 				adev->umc.ras->ecc_info_query_ras_error_count(adev,
157 								ras_error_status);
158 
159 			if (adev->umc.ras &&
160 			    adev->umc.ras->ecc_info_query_ras_error_address &&
161 			    adev->umc.max_ras_err_cnt_per_query) {
162 				err_data->err_addr =
163 					kcalloc(adev->umc.max_ras_err_cnt_per_query,
164 						sizeof(struct eeprom_table_record), GFP_KERNEL);
165 
166 				/* still call query_ras_error_address to clear error status
167 				 * even NOMEM error is encountered
168 				 */
169 				if (!err_data->err_addr)
170 					dev_warn(adev->dev,
171 						"Failed to alloc memory for umc error address record!\n");
172 				else
173 					err_data->err_addr_len =
174 						adev->umc.max_ras_err_cnt_per_query;
175 
176 				/* umc query_ras_error_address is also responsible for clearing
177 				 * error status
178 				 */
179 				adev->umc.ras->ecc_info_query_ras_error_address(adev,
180 								ras_error_status);
181 			}
182 		}
183 	} else {
184 		if (!amdgpu_ras_eeprom_update_record_num(control)) {
185 			err_data->err_addr_cnt = err_data->de_count =
186 				control->ras_num_recs -	control->ras_num_recs_old;
187 			amdgpu_ras_eeprom_read_idx(control, err_data->err_addr,
188 				control->ras_num_recs_old, err_data->de_count);
189 		}
190 	}
191 
192 	/* only uncorrectable error needs gpu reset */
193 	if (err_data->ue_count || err_data->de_count) {
194 		err_count = err_data->ue_count + err_data->de_count;
195 		if ((amdgpu_bad_page_threshold != 0) &&
196 			err_data->err_addr_cnt) {
197 			amdgpu_ras_add_bad_pages(adev, err_data->err_addr,
198 				err_data->err_addr_cnt, amdgpu_ras_smu_eeprom_supported(adev));
199 			amdgpu_ras_save_bad_pages(adev, &err_count);
200 
201 			amdgpu_dpm_send_hbm_bad_pages_num(adev,
202 					con->eeprom_control.ras_num_bad_pages);
203 
204 			if (con->update_channel_flag == true) {
205 				amdgpu_dpm_send_hbm_bad_channel_flag(adev, con->eeprom_control.bad_channel_bitmap);
206 				con->update_channel_flag = false;
207 			}
208 		}
209 	}
210 
211 	kfree(err_data->err_addr);
212 	err_data->err_addr = NULL;
213 
214 	mutex_unlock(&con->page_retirement_lock);
215 }
216 
217 static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
218 		void *ras_error_status,
219 		struct amdgpu_iv_entry *entry,
220 		uint32_t reset)
221 {
222 	struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
223 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
224 
225 	kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
226 	amdgpu_umc_handle_bad_pages(adev, ras_error_status);
227 
228 	if ((err_data->ue_count || err_data->de_count) &&
229 	    (reset || amdgpu_ras_is_rma(adev))) {
230 		con->gpu_reset_flags |= reset;
231 		amdgpu_ras_reset_gpu(adev);
232 	}
233 
234 	return AMDGPU_RAS_SUCCESS;
235 }
236 
237 int amdgpu_umc_pasid_poison_handler(struct amdgpu_device *adev,
238 			enum amdgpu_ras_block block, uint16_t pasid,
239 			pasid_notify pasid_fn, void *data, uint32_t reset)
240 {
241 	int ret = AMDGPU_RAS_SUCCESS;
242 
243 	if (adev->gmc.xgmi.connected_to_cpu ||
244 		adev->gmc.is_app_apu) {
245 		if (reset) {
246 			/* MCA poison handler is only responsible for GPU reset,
247 			 * let MCA notifier do page retirement.
248 			 */
249 			kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
250 			amdgpu_ras_reset_gpu(adev);
251 		}
252 		return ret;
253 	}
254 
255 	if (!amdgpu_sriov_vf(adev)) {
256 		if (amdgpu_ip_version(adev, UMC_HWIP, 0) < IP_VERSION(12, 0, 0)) {
257 			struct ras_err_data err_data;
258 			struct ras_common_if head = {
259 				.block = AMDGPU_RAS_BLOCK__UMC,
260 			};
261 			struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head);
262 
263 			ret = amdgpu_ras_error_data_init(&err_data);
264 			if (ret)
265 				return ret;
266 
267 			ret = amdgpu_umc_do_page_retirement(adev, &err_data, NULL, reset);
268 
269 			if (ret == AMDGPU_RAS_SUCCESS && obj) {
270 				obj->err_data.ue_count += err_data.ue_count;
271 				obj->err_data.ce_count += err_data.ce_count;
272 				obj->err_data.de_count += err_data.de_count;
273 			}
274 
275 			amdgpu_ras_error_data_fini(&err_data);
276 		} else {
277 			struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
278 			int ret;
279 
280 			ret = amdgpu_ras_put_poison_req(adev,
281 				block, pasid, pasid_fn, data, reset);
282 			if (!ret) {
283 				atomic_inc(&con->page_retirement_req_cnt);
284 				atomic_inc(&con->poison_consumption_count);
285 				wake_up(&con->page_retirement_wq);
286 			}
287 		}
288 	} else {
289 		if (adev->virt.ops && adev->virt.ops->ras_poison_handler)
290 			adev->virt.ops->ras_poison_handler(adev, block);
291 		else
292 			dev_warn(adev->dev,
293 				"No ras_poison_handler interface in SRIOV!\n");
294 	}
295 
296 	return ret;
297 }
298 
299 int amdgpu_umc_poison_handler(struct amdgpu_device *adev,
300 			enum amdgpu_ras_block block, uint32_t reset)
301 {
302 	return amdgpu_umc_pasid_poison_handler(adev,
303 				block, 0, NULL, NULL, reset);
304 }
305 
306 int amdgpu_umc_process_ras_data_cb(struct amdgpu_device *adev,
307 		void *ras_error_status,
308 		struct amdgpu_iv_entry *entry)
309 {
310 	return amdgpu_umc_do_page_retirement(adev, ras_error_status, entry,
311 				AMDGPU_RAS_GPU_RESET_MODE1_RESET);
312 }
313 
314 int amdgpu_umc_ras_sw_init(struct amdgpu_device *adev)
315 {
316 	int err;
317 	struct amdgpu_umc_ras *ras;
318 
319 	if (!adev->umc.ras)
320 		return 0;
321 
322 	ras = adev->umc.ras;
323 
324 	err = amdgpu_ras_register_ras_block(adev, &ras->ras_block);
325 	if (err) {
326 		dev_err(adev->dev, "Failed to register umc ras block!\n");
327 		return err;
328 	}
329 
330 	strcpy(adev->umc.ras->ras_block.ras_comm.name, "umc");
331 	ras->ras_block.ras_comm.block = AMDGPU_RAS_BLOCK__UMC;
332 	ras->ras_block.ras_comm.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
333 	adev->umc.ras_if = &ras->ras_block.ras_comm;
334 
335 	if (!ras->ras_block.ras_late_init)
336 		ras->ras_block.ras_late_init = amdgpu_umc_ras_late_init;
337 
338 	if (!ras->ras_block.ras_cb)
339 		ras->ras_block.ras_cb = amdgpu_umc_process_ras_data_cb;
340 
341 	return 0;
342 }
343 
344 int amdgpu_umc_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *ras_block)
345 {
346 	int r;
347 
348 	r = amdgpu_ras_block_late_init(adev, ras_block);
349 	if (r)
350 		return r;
351 
352 	if (amdgpu_sriov_vf(adev))
353 		return r;
354 
355 	if (amdgpu_ras_is_supported(adev, ras_block->block)) {
356 		r = amdgpu_irq_get(adev, &adev->gmc.ecc_irq, 0);
357 		if (r)
358 			goto late_fini;
359 	}
360 
361 	/* ras init of specific umc version */
362 	if (adev->umc.ras &&
363 	    adev->umc.ras->err_cnt_init)
364 		adev->umc.ras->err_cnt_init(adev);
365 
366 	return 0;
367 
368 late_fini:
369 	amdgpu_ras_block_late_fini(adev, ras_block);
370 	return r;
371 }
372 
373 int amdgpu_umc_process_ecc_irq(struct amdgpu_device *adev,
374 		struct amdgpu_irq_src *source,
375 		struct amdgpu_iv_entry *entry)
376 {
377 	struct ras_common_if *ras_if = adev->umc.ras_if;
378 	struct ras_dispatch_if ih_data = {
379 		.entry = entry,
380 	};
381 
382 	if (!ras_if)
383 		return 0;
384 
385 	ih_data.head = *ras_if;
386 
387 	amdgpu_ras_interrupt_dispatch(adev, &ih_data);
388 	return 0;
389 }
390 
391 int amdgpu_umc_fill_error_record(struct ras_err_data *err_data,
392 		uint64_t err_addr,
393 		uint64_t retired_page,
394 		uint32_t channel_index,
395 		uint32_t umc_inst)
396 {
397 	struct eeprom_table_record *err_rec;
398 
399 	if (!err_data ||
400 	    !err_data->err_addr ||
401 	    (err_data->err_addr_cnt >= err_data->err_addr_len))
402 		return -EINVAL;
403 
404 	err_rec = &err_data->err_addr[err_data->err_addr_cnt];
405 
406 	err_rec->address = err_addr;
407 	/* page frame address is saved */
408 	err_rec->retired_page = retired_page >> AMDGPU_GPU_PAGE_SHIFT;
409 	err_rec->ts = (uint64_t)ktime_get_real_seconds();
410 	err_rec->err_type = AMDGPU_RAS_EEPROM_ERR_NON_RECOVERABLE;
411 	err_rec->cu = 0;
412 	err_rec->mem_channel = channel_index;
413 	err_rec->mcumc_id = umc_inst;
414 
415 	err_data->err_addr_cnt++;
416 
417 	return 0;
418 }
419 
/*
 * amdgpu_umc_loop_all_aid - invoke @func for every channel of every active
 * UMC instance across all node instances (AIDs).
 *
 * Returns 0 on success; stops and returns @func's value at the first
 * non-zero return.
 */
static int amdgpu_umc_loop_all_aid(struct amdgpu_device *adev, umc_func func,
				   void *data)
{
	uint32_t umc_node_inst;	/* flat bit index: node * umc_inst_num + umc */
	uint32_t node_inst;
	uint32_t umc_inst;
	uint32_t ch_inst;
	int ret;

	/*
	 * This loop is done based on the following -
	 * umc.active mask = mask of active umc instances across all nodes
	 * umc.umc_inst_num = maximum number of umc instancess per node
	 * umc.node_inst_num = maximum number of node instances
	 * Channel instances are not assumed to be harvested.
	 */
	dev_dbg(adev->dev, "active umcs :%lx umc_inst per node: %d",
		adev->umc.active_mask, adev->umc.umc_inst_num);
	for_each_set_bit(umc_node_inst, &(adev->umc.active_mask),
			 adev->umc.node_inst_num * adev->umc.umc_inst_num) {
		/* Decompose the flat index back into (node, umc). */
		node_inst = umc_node_inst / adev->umc.umc_inst_num;
		umc_inst = umc_node_inst % adev->umc.umc_inst_num;
		LOOP_UMC_CH_INST(ch_inst) {
			dev_dbg(adev->dev,
				"node_inst :%d umc_inst: %d ch_inst: %d",
				node_inst, umc_inst, ch_inst);
			ret = func(adev, node_inst, umc_inst, ch_inst, data);
			if (ret) {
				dev_err(adev->dev,
					"Node %d umc %d ch %d func returns %d\n",
					node_inst, umc_inst, ch_inst, ret);
				return ret;
			}
		}
	}

	return 0;
}
458 
/*
 * amdgpu_umc_loop_channels - invoke @func for every UMC channel instance.
 *
 * Dispatches to the appropriate iteration scheme:
 *  - AID-based parts (adev->aid_mask set): amdgpu_umc_loop_all_aid();
 *  - parts with node instances: LOOP_UMC_EACH_NODE_INST_AND_CH;
 *  - otherwise: LOOP_UMC_INST_AND_CH with node fixed at 0.
 *
 * Returns 0 on success; stops at and returns @func's first non-zero
 * return value.
 */
int amdgpu_umc_loop_channels(struct amdgpu_device *adev,
			umc_func func, void *data)
{
	uint32_t node_inst       = 0;
	uint32_t umc_inst        = 0;
	uint32_t ch_inst         = 0;
	int ret = 0;

	if (adev->aid_mask)
		return amdgpu_umc_loop_all_aid(adev, func, data);

	if (adev->umc.node_inst_num) {
		LOOP_UMC_EACH_NODE_INST_AND_CH(node_inst, umc_inst, ch_inst) {
			ret = func(adev, node_inst, umc_inst, ch_inst, data);
			if (ret) {
				dev_err(adev->dev, "Node %d umc %d ch %d func returns %d\n",
					node_inst, umc_inst, ch_inst, ret);
				return ret;
			}
		}
	} else {
		LOOP_UMC_INST_AND_CH(umc_inst, ch_inst) {
			ret = func(adev, 0, umc_inst, ch_inst, data);
			if (ret) {
				dev_err(adev->dev, "Umc %d ch %d func returns %d\n",
					umc_inst, ch_inst, ret);
				return ret;
			}
		}
	}

	return 0;
}
492 
493 int amdgpu_umc_update_ecc_status(struct amdgpu_device *adev,
494 				uint64_t status, uint64_t ipid, uint64_t addr)
495 {
496 	if (adev->umc.ras->update_ecc_status)
497 		return adev->umc.ras->update_ecc_status(adev,
498 					status, ipid, addr);
499 	return 0;
500 }
501 
502 int amdgpu_umc_logs_ecc_err(struct amdgpu_device *adev,
503 		struct radix_tree_root *ecc_tree, struct ras_ecc_err *ecc_err)
504 {
505 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
506 	struct ras_ecc_log_info *ecc_log;
507 	int ret;
508 
509 	ecc_log = &con->umc_ecc_log;
510 
511 	mutex_lock(&ecc_log->lock);
512 	ret = radix_tree_insert(ecc_tree, ecc_err->pa_pfn, ecc_err);
513 	if (!ret)
514 		radix_tree_tag_set(ecc_tree,
515 			ecc_err->pa_pfn, UMC_ECC_NEW_DETECTED_TAG);
516 	mutex_unlock(&ecc_log->lock);
517 
518 	return ret;
519 }
520 
521 int amdgpu_umc_pages_in_a_row(struct amdgpu_device *adev,
522 			struct ras_err_data *err_data, uint64_t pa_addr)
523 {
524 	struct ta_ras_query_address_output addr_out;
525 
526 	/* reinit err_data */
527 	err_data->err_addr_cnt = 0;
528 	err_data->err_addr_len = adev->umc.retire_unit;
529 
530 	addr_out.pa.pa = pa_addr;
531 	if (adev->umc.ras && adev->umc.ras->convert_ras_err_addr)
532 		return adev->umc.ras->convert_ras_err_addr(adev, err_data, NULL,
533 				&addr_out, false);
534 	else
535 		return -EINVAL;
536 }
537 
/*
 * amdgpu_umc_lookup_bad_pages_in_a_row - expand @pa_addr into the page
 * frame numbers of all pages in its retirement unit (row).
 *
 * Fills @pfns (capacity @len) from the converted records and returns the
 * number of entries written, 0 on allocation failure, or a negative error
 * from the address conversion.
 *
 * NOTE(review): when @len < adev->umc.retire_unit, the loop exits via
 * 'goto out' before 'ret = i' is assigned, so 0 is returned even though
 * @len pfns were written — confirm callers always pass
 * len >= retire_unit, otherwise the return value undercounts.
 */
int amdgpu_umc_lookup_bad_pages_in_a_row(struct amdgpu_device *adev,
			uint64_t pa_addr, uint64_t *pfns, int len)
{
	int i, ret;
	struct ras_err_data err_data;

	err_data.err_addr = kcalloc(adev->umc.retire_unit,
				sizeof(struct eeprom_table_record), GFP_KERNEL);
	if (!err_data.err_addr) {
		dev_warn(adev->dev, "Failed to alloc memory in bad page lookup!\n");
		return 0;
	}

	ret = amdgpu_umc_pages_in_a_row(adev, &err_data, pa_addr);
	if (ret)
		goto out;

	/* Copy the retired-page pfns, bounded by the caller's buffer. */
	for (i = 0; i < adev->umc.retire_unit; i++) {
		if (i >= len)
			goto out;

		pfns[i] = err_data.err_addr[i].retired_page;
	}
	ret = i;
	adev->umc.err_addr_cnt = err_data.err_addr_cnt;

out:
	kfree(err_data.err_addr);
	return ret;
}
568 
569 int amdgpu_umc_mca_to_addr(struct amdgpu_device *adev,
570 			uint64_t err_addr, uint32_t ch, uint32_t umc,
571 			uint32_t node, uint32_t socket,
572 			struct ta_ras_query_address_output *addr_out, bool dump_addr)
573 {
574 	struct ta_ras_query_address_input addr_in;
575 	int ret;
576 
577 	memset(&addr_in, 0, sizeof(addr_in));
578 	addr_in.ma.err_addr = err_addr;
579 	addr_in.ma.ch_inst = ch;
580 	addr_in.ma.umc_inst = umc;
581 	addr_in.ma.node_inst = node;
582 	addr_in.ma.socket_id = socket;
583 
584 	if (adev->umc.ras && adev->umc.ras->convert_ras_err_addr) {
585 		ret = adev->umc.ras->convert_ras_err_addr(adev, NULL, &addr_in,
586 				addr_out, dump_addr);
587 		if (ret)
588 			return ret;
589 	} else {
590 		return 0;
591 	}
592 
593 	return 0;
594 }
595 
596 int amdgpu_umc_pa2mca(struct amdgpu_device *adev,
597 		uint64_t pa, uint64_t *mca, enum amdgpu_memory_partition nps)
598 {
599 	struct ta_ras_query_address_input addr_in;
600 	struct ta_ras_query_address_output addr_out;
601 	int ret;
602 
603 	/* nps: the pa belongs to */
604 	addr_in.pa.pa = pa | ((uint64_t)nps << 58);
605 	addr_in.addr_type = TA_RAS_PA_TO_MCA;
606 	ret = psp_ras_query_address(&adev->psp, &addr_in, &addr_out);
607 	if (ret) {
608 		dev_warn(adev->dev, "Failed to query RAS MCA address for 0x%llx",
609 			pa);
610 
611 		return ret;
612 	}
613 
614 	*mca = addr_out.ma.err_addr;
615 
616 	return 0;
617 }
618