/*
 * Copyright 2021 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 */
#include "amdgpu_ras.h"
#include "amdgpu.h"
#include "amdgpu_mca.h"

#include "umc/umc_6_7_0_offset.h"
#include "umc/umc_6_7_0_sh_mask.h"

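/*
 * Check whether the MCA STATUS value in mc_status describes a deferred
 * error, using the UMC RAS check_ecc_err_status() callback when one is
 * provided.
 */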
static bool amdgpu_mca_is_deferred_error(struct amdgpu_device *adev,
					uint64_t mc_status)
{
	if (adev->umc.ras->check_ecc_err_status)
		return adev->umc.ras->check_ecc_err_status(adev,
				AMDGPU_MCA_ERROR_TYPE_DE, &mc_status);

	return false;
}

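/*
 * Count one correctable error when the bank at mc_status_addr is valid and
 * its CECC bit is set.
 */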
void amdgpu_mca_query_correctable_error_count(struct amdgpu_device *adev,
					      uint64_t mc_status_addr,
					      unsigned long *error_count)
{
	uint64_t mc_status = RREG64_PCIE(mc_status_addr);

	if (REG_GET_FIELD(mc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 &&
	    REG_GET_FIELD(mc_status, MCA_UMC_UMC0_MCUMC_STATUST0, CECC) == 1)
		*error_count += 1;
}

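/*
 * Count one uncorrectable error when the bank is valid and any of the
 * Deferred, UECC, PCC, UC or TCC status bits is set.
 */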
void amdgpu_mca_query_uncorrectable_error_count(struct amdgpu_device *adev,
						uint64_t mc_status_addr,
						unsigned long *error_count)
{
	uint64_t mc_status = RREG64_PCIE(mc_status_addr);

	if ((REG_GET_FIELD(mc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1) &&
	    (REG_GET_FIELD(mc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Deferred) == 1 ||
	    REG_GET_FIELD(mc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1 ||
	    REG_GET_FIELD(mc_status, MCA_UMC_UMC0_MCUMC_STATUST0, PCC) == 1 ||
	    REG_GET_FIELD(mc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UC) == 1 ||
	    REG_GET_FIELD(mc_status, MCA_UMC_UMC0_MCUMC_STATUST0, TCC) == 1))
		*error_count += 1;
}

void amdgpu_mca_reset_error_count(struct amdgpu_device *adev,
				  uint64_t mc_status_addr)
{
	WREG64_PCIE(mc_status_addr, 0x0ULL);
}

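/*
 * Accumulate the CE and UE counts of the bank at mc_status_addr into the
 * ras_err_data passed via ras_error_status, then clear the bank status so
 * new errors can be latched.
 */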
void amdgpu_mca_query_ras_error_count(struct amdgpu_device *adev,
				      uint64_t mc_status_addr,
				      void *ras_error_status)
{
	struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;

	amdgpu_mca_query_correctable_error_count(adev, mc_status_addr, &(err_data->ce_count));
	amdgpu_mca_query_uncorrectable_error_count(adev, mc_status_addr, &(err_data->ue_count));

	amdgpu_mca_reset_error_count(adev, mc_status_addr);
}

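/*
 * Register the MP0 MCA RAS block with the RAS core when the ASIC code has
 * provided one; the MP1 and MPIO variants below do the same for their
 * respective instances.
 */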
int amdgpu_mca_mp0_ras_sw_init(struct amdgpu_device *adev)
{
	int err;
	struct amdgpu_mca_ras_block *ras;

	if (!adev->mca.mp0.ras)
		return 0;

	ras = adev->mca.mp0.ras;

	err = amdgpu_ras_register_ras_block(adev, &ras->ras_block);
	if (err) {
		dev_err(adev->dev, "Failed to register mca.mp0 ras block!\n");
		return err;
	}

	strcpy(ras->ras_block.ras_comm.name, "mca.mp0");
	ras->ras_block.ras_comm.block = AMDGPU_RAS_BLOCK__MCA;
	ras->ras_block.ras_comm.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
	adev->mca.mp0.ras_if = &ras->ras_block.ras_comm;

	return 0;
}

int amdgpu_mca_mp1_ras_sw_init(struct amdgpu_device *adev)
{
	int err;
	struct amdgpu_mca_ras_block *ras;

	if (!adev->mca.mp1.ras)
		return 0;

	ras = adev->mca.mp1.ras;

	err = amdgpu_ras_register_ras_block(adev, &ras->ras_block);
	if (err) {
		dev_err(adev->dev, "Failed to register mca.mp1 ras block!\n");
		return err;
	}

	strcpy(ras->ras_block.ras_comm.name, "mca.mp1");
	ras->ras_block.ras_comm.block = AMDGPU_RAS_BLOCK__MCA;
	ras->ras_block.ras_comm.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
	adev->mca.mp1.ras_if = &ras->ras_block.ras_comm;

	return 0;
}

int amdgpu_mca_mpio_ras_sw_init(struct amdgpu_device *adev)
{
	int err;
	struct amdgpu_mca_ras_block *ras;

	if (!adev->mca.mpio.ras)
		return 0;

	ras = adev->mca.mpio.ras;

	err = amdgpu_ras_register_ras_block(adev, &ras->ras_block);
	if (err) {
		dev_err(adev->dev, "Failed to register mca.mpio ras block!\n");
		return err;
	}

	strcpy(ras->ras_block.ras_comm.name, "mca.mpio");
	ras->ras_block.ras_comm.block = AMDGPU_RAS_BLOCK__MCA;
	ras->ras_block.ras_comm.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
	adev->mca.mpio.ras_if = &ras->ras_block.ras_comm;

	return 0;
}

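/*
 * mca_bank_set helpers: a mca_bank_set is a simple list of MCA bank entries,
 * used both to collect the banks reported by the SMU and to cache banks that
 * have not been dispatched to a RAS block yet.
 */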
static void amdgpu_mca_bank_set_init(struct mca_bank_set *mca_set)
{
	if (!mca_set)
		return;

	memset(mca_set, 0, sizeof(*mca_set));
	INIT_LIST_HEAD(&mca_set->list);
}

static int amdgpu_mca_bank_set_add_entry(struct mca_bank_set *mca_set, struct mca_bank_entry *entry)
{
	struct mca_bank_node *node;

	if (!entry)
		return -EINVAL;

	node = kvzalloc(sizeof(*node), GFP_KERNEL);
	if (!node)
		return -ENOMEM;

	memcpy(&node->entry, entry, sizeof(*entry));

	INIT_LIST_HEAD(&node->node);
	list_add_tail(&node->node, &mca_set->list);

	mca_set->nr_entries++;

	return 0;
}

static int amdgpu_mca_bank_set_merge(struct mca_bank_set *mca_set, struct mca_bank_set *new)
{
	struct mca_bank_node *node;

	list_for_each_entry(node, &new->list, node)
		amdgpu_mca_bank_set_add_entry(mca_set, &node->entry);

	return 0;
}

static void amdgpu_mca_bank_set_remove_node(struct mca_bank_set *mca_set, struct mca_bank_node *node)
{
	if (!node)
		return;

	list_del(&node->node);
	kvfree(node);

	mca_set->nr_entries--;
}

static void amdgpu_mca_bank_set_release(struct mca_bank_set *mca_set)
{
	struct mca_bank_node *node, *tmp;

	if (list_empty(&mca_set->list))
		return;

	list_for_each_entry_safe(node, tmp, &mca_set->list, node)
		amdgpu_mca_bank_set_remove_node(mca_set, node);
}

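/*
 * Install the ASIC-specific SMU MCA callbacks. The SMU IP code is expected
 * to call this during init so that the helpers below can query MCA banks
 * through the firmware.
 */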
void amdgpu_mca_smu_init_funcs(struct amdgpu_device *adev, const struct amdgpu_mca_smu_funcs *mca_funcs)
{
	struct amdgpu_mca *mca = &adev->mca;

	mca->mca_funcs = mca_funcs;
}

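/*
 * Set up the per-error-type MCA bank caches and clear the UE update flag;
 * amdgpu_mca_fini() releases the caches again.
 */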
int amdgpu_mca_init(struct amdgpu_device *adev)
{
	struct amdgpu_mca *mca = &adev->mca;
	struct mca_bank_cache *mca_cache;
	int i;

	atomic_set(&mca->ue_update_flag, 0);

	for (i = 0; i < ARRAY_SIZE(mca->mca_caches); i++) {
		mca_cache = &mca->mca_caches[i];
		mutex_init(&mca_cache->lock);
		amdgpu_mca_bank_set_init(&mca_cache->mca_set);
	}

	return 0;
}

void amdgpu_mca_fini(struct amdgpu_device *adev)
{
	struct amdgpu_mca *mca = &adev->mca;
	struct mca_bank_cache *mca_cache;
	int i;

	atomic_set(&mca->ue_update_flag, 0);

	for (i = 0; i < ARRAY_SIZE(mca->mca_caches); i++) {
		mca_cache = &mca->mca_caches[i];
		amdgpu_mca_bank_set_release(&mca_cache->mca_set);
		mutex_destroy(&mca_cache->lock);
	}
}

int amdgpu_mca_reset(struct amdgpu_device *adev)
{
	amdgpu_mca_fini(adev);

	return amdgpu_mca_init(adev);
}

int amdgpu_mca_smu_set_debug_mode(struct amdgpu_device *adev, bool enable)
{
	const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs;

	if (mca_funcs && mca_funcs->mca_set_debug_mode)
		return mca_funcs->mca_set_debug_mode(adev, enable);

	return -EOPNOTSUPP;
}

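/*
 * Dump the raw registers (STATUS, ADDR, MISC0, IPID and SYND) of one ACA
 * bank entry through the RAS event log.
 */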
static void amdgpu_mca_smu_mca_bank_dump(struct amdgpu_device *adev, int idx, struct mca_bank_entry *entry,
					 struct ras_query_context *qctx)
{
	u64 event_id = qctx ? qctx->evid.event_id : RAS_EVENT_INVALID_ID;

	RAS_EVENT_LOG(adev, event_id, HW_ERR "Accelerator Check Architecture events logged\n");
	RAS_EVENT_LOG(adev, event_id, HW_ERR "aca entry[%02d].STATUS=0x%016llx\n",
		      idx, entry->regs[MCA_REG_IDX_STATUS]);
	RAS_EVENT_LOG(adev, event_id, HW_ERR "aca entry[%02d].ADDR=0x%016llx\n",
		      idx, entry->regs[MCA_REG_IDX_ADDR]);
	RAS_EVENT_LOG(adev, event_id, HW_ERR "aca entry[%02d].MISC0=0x%016llx\n",
		      idx, entry->regs[MCA_REG_IDX_MISC0]);
	RAS_EVENT_LOG(adev, event_id, HW_ERR "aca entry[%02d].IPID=0x%016llx\n",
		      idx, entry->regs[MCA_REG_IDX_IPID]);
	RAS_EVENT_LOG(adev, event_id, HW_ERR "aca entry[%02d].SYND=0x%016llx\n",
		      idx, entry->regs[MCA_REG_IDX_SYND]);
}

static int amdgpu_mca_smu_get_valid_mca_count(struct amdgpu_device *adev, enum amdgpu_mca_error_type type, uint32_t *count)
{
	const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs;

	if (!count)
		return -EINVAL;

	if (mca_funcs && mca_funcs->mca_get_valid_mca_count)
		return mca_funcs->mca_get_valid_mca_count(adev, type, count);

	return -EOPNOTSUPP;
}

static int amdgpu_mca_smu_get_mca_entry(struct amdgpu_device *adev, enum amdgpu_mca_error_type type,
					int idx, struct mca_bank_entry *entry)
{
	const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs;
	int count;

	if (!mca_funcs || !mca_funcs->mca_get_mca_entry)
		return -EOPNOTSUPP;

	switch (type) {
	case AMDGPU_MCA_ERROR_TYPE_UE:
		count = mca_funcs->max_ue_count;
		break;
	case AMDGPU_MCA_ERROR_TYPE_CE:
		count = mca_funcs->max_ce_count;
		break;
	default:
		return -EINVAL;
	}

	if (idx >= count)
		return -EINVAL;

	return mca_funcs->mca_get_mca_entry(adev, type, idx, entry);
}

static bool amdgpu_mca_bank_should_update(struct amdgpu_device *adev, enum amdgpu_mca_error_type type)
{
	struct amdgpu_mca *mca = &adev->mca;
	bool ret = true;

	/*
	 * Because the UE valid MCA count is only cleared after reset, the
	 * aca bank is updated only once during the gpu recovery stage to
	 * avoid counting the same errors repeatedly.
	 */
	if (type == AMDGPU_MCA_ERROR_TYPE_UE) {
		if (amdgpu_ras_intr_triggered())
			ret = atomic_cmpxchg(&mca->ue_update_flag, 0, 1) == 0;
		else
			atomic_set(&mca->ue_update_flag, 0);
	}

	return ret;
}

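/*
 * UE banks are always dumped; CE banks are only dumped when they actually
 * carry a deferred error.
 */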
static bool amdgpu_mca_bank_should_dump(struct amdgpu_device *adev, enum amdgpu_mca_error_type type,
					struct mca_bank_entry *entry)
{
	bool ret;

	switch (type) {
	case AMDGPU_MCA_ERROR_TYPE_CE:
		ret = amdgpu_mca_is_deferred_error(adev, entry->regs[MCA_REG_IDX_STATUS]);
		break;
	case AMDGPU_MCA_ERROR_TYPE_UE:
	default:
		ret = true;
		break;
	}

	return ret;
}

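/*
 * Collect every currently valid bank of the given error type from the SMU
 * into mca_set (unless amdgpu_mca_bank_should_update() says the banks were
 * already collected) and dump the entries worth logging.
 */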
static int amdgpu_mca_smu_get_mca_set(struct amdgpu_device *adev, enum amdgpu_mca_error_type type, struct mca_bank_set *mca_set,
				      struct ras_query_context *qctx)
{
	struct mca_bank_entry entry;
	uint32_t count = 0, i;
	int ret;

	if (!mca_set)
		return -EINVAL;

	if (!amdgpu_mca_bank_should_update(adev, type))
		return 0;

	ret = amdgpu_mca_smu_get_valid_mca_count(adev, type, &count);
	if (ret)
		return ret;

	for (i = 0; i < count; i++) {
		memset(&entry, 0, sizeof(entry));
		ret = amdgpu_mca_smu_get_mca_entry(adev, type, i, &entry);
		if (ret)
			return ret;

		amdgpu_mca_bank_set_add_entry(mca_set, &entry);

		if (amdgpu_mca_bank_should_dump(adev, type, &entry))
			amdgpu_mca_smu_mca_bank_dump(adev, i, &entry, qctx);
	}

	return 0;
}

static int amdgpu_mca_smu_parse_mca_error_count(struct amdgpu_device *adev, enum amdgpu_ras_block blk,
						enum amdgpu_mca_error_type type, struct mca_bank_entry *entry, uint32_t *count)
{
	const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs;

	if (!count || !entry)
		return -EINVAL;

	if (!mca_funcs || !mca_funcs->mca_parse_mca_error_count)
		return -EOPNOTSUPP;

	return mca_funcs->mca_parse_mca_error_count(adev, blk, type, entry, count);
}

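/*
 * Walk the banks in mca_set, fold each one that decodes to a non-zero count
 * for the given RAS block into per-die UE/CE/deferred statistics in
 * err_data, and remove the consumed banks; banks with no count stay in the
 * set.
 */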
static int amdgpu_mca_dispatch_mca_set(struct amdgpu_device *adev, enum amdgpu_ras_block blk, enum amdgpu_mca_error_type type,
				       struct mca_bank_set *mca_set, struct ras_err_data *err_data)
{
	struct amdgpu_smuio_mcm_config_info mcm_info;
	struct mca_bank_node *node, *tmp;
	struct mca_bank_entry *entry;
	uint32_t count;
	int ret;

	if (!mca_set)
		return -EINVAL;

	if (!mca_set->nr_entries)
		return 0;

	list_for_each_entry_safe(node, tmp, &mca_set->list, node) {
		entry = &node->entry;

		count = 0;
		ret = amdgpu_mca_smu_parse_mca_error_count(adev, blk, type, entry, &count);
		if (ret && ret != -EOPNOTSUPP)
			return ret;

		if (!count)
			continue;

		memset(&mcm_info, 0, sizeof(mcm_info));

		mcm_info.socket_id = entry->info.socket_id;
		mcm_info.die_id = entry->info.aid;

		if (type == AMDGPU_MCA_ERROR_TYPE_UE) {
			amdgpu_ras_error_statistic_ue_count(err_data,
							    &mcm_info, (uint64_t)count);
		} else {
			if (amdgpu_mca_is_deferred_error(adev, entry->regs[MCA_REG_IDX_STATUS]))
				amdgpu_ras_error_statistic_de_count(err_data,
								    &mcm_info, (uint64_t)count);
			else
				amdgpu_ras_error_statistic_ce_count(err_data,
								    &mcm_info, (uint64_t)count);
		}

		amdgpu_mca_bank_set_remove_node(mca_set, node);
	}

	return 0;
}

static int amdgpu_mca_add_mca_set_to_cache(struct amdgpu_device *adev, enum amdgpu_mca_error_type type, struct mca_bank_set *new)
{
	struct mca_bank_cache *mca_cache = &adev->mca.mca_caches[type];
	int ret;

	mutex_lock(&mca_cache->lock);
	ret = amdgpu_mca_bank_set_merge(&mca_cache->mca_set, new);
	mutex_unlock(&mca_cache->lock);

	return ret;
}

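/*
 * Log the MCA errors of one error type for a RAS block: fetch fresh banks
 * from the SMU, dispatch them into err_data, park any banks that were not
 * consumed in the per-type cache, and then retry the cached banks.
 */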
int amdgpu_mca_smu_log_ras_error(struct amdgpu_device *adev, enum amdgpu_ras_block blk, enum amdgpu_mca_error_type type,
				 struct ras_err_data *err_data, struct ras_query_context *qctx)
{
	struct mca_bank_set mca_set;
	struct mca_bank_cache *mca_cache = &adev->mca.mca_caches[type];
	int ret;

	amdgpu_mca_bank_set_init(&mca_set);

	ret = amdgpu_mca_smu_get_mca_set(adev, type, &mca_set, qctx);
	if (ret)
		goto out_mca_release;

	ret = amdgpu_mca_dispatch_mca_set(adev, blk, type, &mca_set, err_data);
	if (ret)
		goto out_mca_release;

	/* add the remaining mca banks to the mca cache */
	if (mca_set.nr_entries) {
		ret = amdgpu_mca_add_mca_set_to_cache(adev, type, &mca_set);
		if (ret)
			goto out_mca_release;
	}

	/* dispatch the mca set again if the mca cache has valid data */
	mutex_lock(&mca_cache->lock);
	if (mca_cache->mca_set.nr_entries)
		ret = amdgpu_mca_dispatch_mca_set(adev, blk, type, &mca_cache->mca_set, err_data);
	mutex_unlock(&mca_cache->lock);

out_mca_release:
	amdgpu_mca_bank_set_release(&mca_set);

	return ret;
}

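/*
 * debugfs interface: "mca_debug_mode" toggles the SMU MCA debug mode, while
 * "mca_ue_dump" and "mca_ce_dump" show the currently valid UE/CE banks.
 */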
#if defined(CONFIG_DEBUG_FS)
static int amdgpu_mca_smu_debug_mode_set(void *data, u64 val)
{
	struct amdgpu_device *adev = (struct amdgpu_device *)data;
	int ret;

	ret = amdgpu_ras_set_mca_debug_mode(adev, val ? true : false);
	if (ret)
		return ret;

	dev_info(adev->dev, "amdgpu set smu mca debug mode %s success\n", val ? "on" : "off");

	return 0;
}

static void mca_dump_entry(struct seq_file *m, struct mca_bank_entry *entry)
{
	int i, idx = entry->idx;
	int reg_idx_array[] = {
		MCA_REG_IDX_STATUS,
		MCA_REG_IDX_ADDR,
		MCA_REG_IDX_MISC0,
		MCA_REG_IDX_IPID,
		MCA_REG_IDX_SYND,
	};

	seq_printf(m, "mca entry[%d].type: %s\n", idx, entry->type == AMDGPU_MCA_ERROR_TYPE_UE ? "UE" : "CE");
	seq_printf(m, "mca entry[%d].ip: %d\n", idx, entry->ip);
	seq_printf(m, "mca entry[%d].info: socketid:%d aid:%d hwid:0x%03x mcatype:0x%04x\n",
		   idx, entry->info.socket_id, entry->info.aid, entry->info.hwid, entry->info.mcatype);

	for (i = 0; i < ARRAY_SIZE(reg_idx_array); i++)
		seq_printf(m, "mca entry[%d].regs[%d]: 0x%016llx\n", idx, reg_idx_array[i], entry->regs[reg_idx_array[i]]);
}

static int mca_dump_show(struct seq_file *m, enum amdgpu_mca_error_type type)
{
	struct amdgpu_device *adev = (struct amdgpu_device *)m->private;
	struct mca_bank_node *node;
	struct mca_bank_set mca_set;
	struct ras_query_context qctx;
	int ret;

	amdgpu_mca_bank_set_init(&mca_set);

	qctx.evid.event_id = RAS_EVENT_INVALID_ID;
	ret = amdgpu_mca_smu_get_mca_set(adev, type, &mca_set, &qctx);
	if (ret)
		goto err_free_mca_set;

	seq_printf(m, "amdgpu smu %s valid mca count: %d\n",
		   type == AMDGPU_MCA_ERROR_TYPE_UE ? "UE" : "CE", mca_set.nr_entries);

	if (!mca_set.nr_entries)
		goto err_free_mca_set;

	list_for_each_entry(node, &mca_set.list, node)
		mca_dump_entry(m, &node->entry);

	/* add mca bank to mca bank cache */
	ret = amdgpu_mca_add_mca_set_to_cache(adev, type, &mca_set);

err_free_mca_set:
	amdgpu_mca_bank_set_release(&mca_set);

	return ret;
}

static int mca_dump_ce_show(struct seq_file *m, void *unused)
{
	return mca_dump_show(m, AMDGPU_MCA_ERROR_TYPE_CE);
}

static int mca_dump_ce_open(struct inode *inode, struct file *file)
{
	return single_open(file, mca_dump_ce_show, inode->i_private);
}

static const struct file_operations mca_ce_dump_debug_fops = {
	.owner = THIS_MODULE,
	.open = mca_dump_ce_open,
	.read = seq_read,
	.llseek = seq_lseek,
	.release = single_release,
};

static int mca_dump_ue_show(struct seq_file *m, void *unused)
{
	return mca_dump_show(m, AMDGPU_MCA_ERROR_TYPE_UE);
}

static int mca_dump_ue_open(struct inode *inode, struct file *file)
{
	return single_open(file, mca_dump_ue_show, inode->i_private);
}

static const struct file_operations mca_ue_dump_debug_fops = {
	.owner = THIS_MODULE,
	.open = mca_dump_ue_open,
	.read = seq_read,
	.llseek = seq_lseek,
	.release = single_release,
};

DEFINE_DEBUGFS_ATTRIBUTE(mca_debug_mode_fops, NULL, amdgpu_mca_smu_debug_mode_set, "%llu\n");
#endif

void amdgpu_mca_smu_debugfs_init(struct amdgpu_device *adev, struct dentry *root)
{
#if defined(CONFIG_DEBUG_FS)
	if (!root)
		return;

	debugfs_create_file("mca_debug_mode", 0200, root, adev, &mca_debug_mode_fops);
	debugfs_create_file("mca_ue_dump", 0400, root, adev, &mca_ue_dump_debug_fops);
	debugfs_create_file("mca_ce_dump", 0400, root, adev, &mca_ce_dump_debug_fops);
#endif
}