xref: /linux/drivers/gpu/drm/xe/xe_drm_ras.c (revision 0fc8f6200d2313278fbf4539bbab74677c685531)
1*b40db12bSRiana Tauro // SPDX-License-Identifier: MIT
2*b40db12bSRiana Tauro /*
3*b40db12bSRiana Tauro  * Copyright © 2026 Intel Corporation
4*b40db12bSRiana Tauro  */
5*b40db12bSRiana Tauro 
6*b40db12bSRiana Tauro #include <linux/bitmap.h>
7*b40db12bSRiana Tauro 
8*b40db12bSRiana Tauro #include <drm/drm_managed.h>
9*b40db12bSRiana Tauro #include <drm/drm_print.h>
10*b40db12bSRiana Tauro #include <drm/drm_ras.h>
11*b40db12bSRiana Tauro 
12*b40db12bSRiana Tauro #include "xe_device_types.h"
13*b40db12bSRiana Tauro #include "xe_drm_ras.h"
14*b40db12bSRiana Tauro 
/* Index-by-id name tables from uAPI; sparse entries may be NULL (skipped below). */
static const char * const error_components[] = DRM_XE_RAS_ERROR_COMPONENT_NAMES;
static const char * const error_severity[] = DRM_XE_RAS_ERROR_SEVERITY_NAMES;
17*b40db12bSRiana Tauro 
18*b40db12bSRiana Tauro static int hw_query_error_counter(struct xe_drm_ras_counter *info,
19*b40db12bSRiana Tauro 				  u32 error_id, const char **name, u32 *val)
20*b40db12bSRiana Tauro {
21*b40db12bSRiana Tauro 	if (!info || !info[error_id].name)
22*b40db12bSRiana Tauro 		return -ENOENT;
23*b40db12bSRiana Tauro 
24*b40db12bSRiana Tauro 	*name = info[error_id].name;
25*b40db12bSRiana Tauro 	*val = atomic_read(&info[error_id].counter);
26*b40db12bSRiana Tauro 
27*b40db12bSRiana Tauro 	return 0;
28*b40db12bSRiana Tauro }
29*b40db12bSRiana Tauro 
30*b40db12bSRiana Tauro static int query_uncorrectable_error_counter(struct drm_ras_node *ep, u32 error_id,
31*b40db12bSRiana Tauro 					     const char **name, u32 *val)
32*b40db12bSRiana Tauro {
33*b40db12bSRiana Tauro 	struct xe_device *xe = ep->priv;
34*b40db12bSRiana Tauro 	struct xe_drm_ras *ras = &xe->ras;
35*b40db12bSRiana Tauro 	struct xe_drm_ras_counter *info = ras->info[DRM_XE_RAS_ERR_SEV_UNCORRECTABLE];
36*b40db12bSRiana Tauro 
37*b40db12bSRiana Tauro 	return hw_query_error_counter(info, error_id, name, val);
38*b40db12bSRiana Tauro }
39*b40db12bSRiana Tauro 
40*b40db12bSRiana Tauro static int query_correctable_error_counter(struct drm_ras_node *ep, u32 error_id,
41*b40db12bSRiana Tauro 					   const char **name, u32 *val)
42*b40db12bSRiana Tauro {
43*b40db12bSRiana Tauro 	struct xe_device *xe = ep->priv;
44*b40db12bSRiana Tauro 	struct xe_drm_ras *ras = &xe->ras;
45*b40db12bSRiana Tauro 	struct xe_drm_ras_counter *info = ras->info[DRM_XE_RAS_ERR_SEV_CORRECTABLE];
46*b40db12bSRiana Tauro 
47*b40db12bSRiana Tauro 	return hw_query_error_counter(info, error_id, name, val);
48*b40db12bSRiana Tauro }
49*b40db12bSRiana Tauro 
50*b40db12bSRiana Tauro static struct xe_drm_ras_counter *allocate_and_copy_counters(struct xe_device *xe)
51*b40db12bSRiana Tauro {
52*b40db12bSRiana Tauro 	struct xe_drm_ras_counter *counter;
53*b40db12bSRiana Tauro 	int i;
54*b40db12bSRiana Tauro 
55*b40db12bSRiana Tauro 	counter = kcalloc(DRM_XE_RAS_ERR_COMP_MAX, sizeof(*counter), GFP_KERNEL);
56*b40db12bSRiana Tauro 	if (!counter)
57*b40db12bSRiana Tauro 		return ERR_PTR(-ENOMEM);
58*b40db12bSRiana Tauro 
59*b40db12bSRiana Tauro 	for (i = DRM_XE_RAS_ERR_COMP_CORE_COMPUTE; i < DRM_XE_RAS_ERR_COMP_MAX; i++) {
60*b40db12bSRiana Tauro 		if (!error_components[i])
61*b40db12bSRiana Tauro 			continue;
62*b40db12bSRiana Tauro 
63*b40db12bSRiana Tauro 		counter[i].name = error_components[i];
64*b40db12bSRiana Tauro 		atomic_set(&counter[i].counter, 0);
65*b40db12bSRiana Tauro 	}
66*b40db12bSRiana Tauro 
67*b40db12bSRiana Tauro 	return counter;
68*b40db12bSRiana Tauro }
69*b40db12bSRiana Tauro 
70*b40db12bSRiana Tauro static int assign_node_params(struct xe_device *xe, struct drm_ras_node *node,
71*b40db12bSRiana Tauro 			      const enum drm_xe_ras_error_severity severity)
72*b40db12bSRiana Tauro {
73*b40db12bSRiana Tauro 	struct pci_dev *pdev = to_pci_dev(xe->drm.dev);
74*b40db12bSRiana Tauro 	struct xe_drm_ras *ras = &xe->ras;
75*b40db12bSRiana Tauro 	const char *device_name;
76*b40db12bSRiana Tauro 
77*b40db12bSRiana Tauro 	device_name = kasprintf(GFP_KERNEL, "%04x:%02x:%02x.%d",
78*b40db12bSRiana Tauro 				pci_domain_nr(pdev->bus), pdev->bus->number,
79*b40db12bSRiana Tauro 				PCI_SLOT(pdev->devfn), PCI_FUNC(pdev->devfn));
80*b40db12bSRiana Tauro 
81*b40db12bSRiana Tauro 	if (!device_name)
82*b40db12bSRiana Tauro 		return -ENOMEM;
83*b40db12bSRiana Tauro 
84*b40db12bSRiana Tauro 	node->device_name = device_name;
85*b40db12bSRiana Tauro 	node->node_name = error_severity[severity];
86*b40db12bSRiana Tauro 	node->type = DRM_RAS_NODE_TYPE_ERROR_COUNTER;
87*b40db12bSRiana Tauro 	node->error_counter_range.first = DRM_XE_RAS_ERR_COMP_CORE_COMPUTE;
88*b40db12bSRiana Tauro 	node->error_counter_range.last = DRM_XE_RAS_ERR_COMP_MAX - 1;
89*b40db12bSRiana Tauro 	node->priv = xe;
90*b40db12bSRiana Tauro 
91*b40db12bSRiana Tauro 	ras->info[severity] = allocate_and_copy_counters(xe);
92*b40db12bSRiana Tauro 	if (IS_ERR(ras->info[severity]))
93*b40db12bSRiana Tauro 		return PTR_ERR(ras->info[severity]);
94*b40db12bSRiana Tauro 
95*b40db12bSRiana Tauro 	if (severity == DRM_XE_RAS_ERR_SEV_CORRECTABLE)
96*b40db12bSRiana Tauro 		node->query_error_counter = query_correctable_error_counter;
97*b40db12bSRiana Tauro 	else
98*b40db12bSRiana Tauro 		node->query_error_counter = query_uncorrectable_error_counter;
99*b40db12bSRiana Tauro 
100*b40db12bSRiana Tauro 	return 0;
101*b40db12bSRiana Tauro }
102*b40db12bSRiana Tauro 
103*b40db12bSRiana Tauro static void cleanup_node_param(struct xe_drm_ras *ras, const enum drm_xe_ras_error_severity severity)
104*b40db12bSRiana Tauro {
105*b40db12bSRiana Tauro 	struct drm_ras_node *node = &ras->node[severity];
106*b40db12bSRiana Tauro 
107*b40db12bSRiana Tauro 	kfree(ras->info[severity]);
108*b40db12bSRiana Tauro 	ras->info[severity] = NULL;
109*b40db12bSRiana Tauro 
110*b40db12bSRiana Tauro 	kfree(node->device_name);
111*b40db12bSRiana Tauro 	node->device_name = NULL;
112*b40db12bSRiana Tauro }
113*b40db12bSRiana Tauro 
114*b40db12bSRiana Tauro static int register_nodes(struct xe_device *xe)
115*b40db12bSRiana Tauro {
116*b40db12bSRiana Tauro 	struct xe_drm_ras *ras = &xe->ras;
117*b40db12bSRiana Tauro 	int i;
118*b40db12bSRiana Tauro 
119*b40db12bSRiana Tauro 	for_each_error_severity(i) {
120*b40db12bSRiana Tauro 		struct drm_ras_node *node = &ras->node[i];
121*b40db12bSRiana Tauro 		int ret;
122*b40db12bSRiana Tauro 
123*b40db12bSRiana Tauro 		ret = assign_node_params(xe, node, i);
124*b40db12bSRiana Tauro 		if (ret) {
125*b40db12bSRiana Tauro 			cleanup_node_param(ras, i);
126*b40db12bSRiana Tauro 			return ret;
127*b40db12bSRiana Tauro 		}
128*b40db12bSRiana Tauro 
129*b40db12bSRiana Tauro 		ret = drm_ras_node_register(node);
130*b40db12bSRiana Tauro 		if (ret) {
131*b40db12bSRiana Tauro 			cleanup_node_param(ras, i);
132*b40db12bSRiana Tauro 			return ret;
133*b40db12bSRiana Tauro 		}
134*b40db12bSRiana Tauro 	}
135*b40db12bSRiana Tauro 
136*b40db12bSRiana Tauro 	return 0;
137*b40db12bSRiana Tauro }
138*b40db12bSRiana Tauro 
139*b40db12bSRiana Tauro static void xe_drm_ras_unregister_nodes(struct drm_device *device, void *arg)
140*b40db12bSRiana Tauro {
141*b40db12bSRiana Tauro 	struct xe_device *xe = arg;
142*b40db12bSRiana Tauro 	struct xe_drm_ras *ras = &xe->ras;
143*b40db12bSRiana Tauro 	int i;
144*b40db12bSRiana Tauro 
145*b40db12bSRiana Tauro 	for_each_error_severity(i) {
146*b40db12bSRiana Tauro 		struct drm_ras_node *node = &ras->node[i];
147*b40db12bSRiana Tauro 
148*b40db12bSRiana Tauro 		drm_ras_node_unregister(node);
149*b40db12bSRiana Tauro 		cleanup_node_param(ras, i);
150*b40db12bSRiana Tauro 	}
151*b40db12bSRiana Tauro }
152*b40db12bSRiana Tauro 
153*b40db12bSRiana Tauro /**
154*b40db12bSRiana Tauro  * xe_drm_ras_init() - Initialize DRM RAS
155*b40db12bSRiana Tauro  * @xe: xe device instance
156*b40db12bSRiana Tauro  *
157*b40db12bSRiana Tauro  * Allocate and register DRM RAS nodes per device
158*b40db12bSRiana Tauro  *
159*b40db12bSRiana Tauro  * Return: 0 on success, negative error code otherwise.
160*b40db12bSRiana Tauro  */
161*b40db12bSRiana Tauro int xe_drm_ras_init(struct xe_device *xe)
162*b40db12bSRiana Tauro {
163*b40db12bSRiana Tauro 	struct xe_drm_ras *ras = &xe->ras;
164*b40db12bSRiana Tauro 	struct drm_ras_node *node;
165*b40db12bSRiana Tauro 	int err;
166*b40db12bSRiana Tauro 
167*b40db12bSRiana Tauro 	node = drmm_kcalloc(&xe->drm, DRM_XE_RAS_ERR_SEV_MAX, sizeof(*node), GFP_KERNEL);
168*b40db12bSRiana Tauro 	if (!node)
169*b40db12bSRiana Tauro 		return -ENOMEM;
170*b40db12bSRiana Tauro 
171*b40db12bSRiana Tauro 	ras->node = node;
172*b40db12bSRiana Tauro 
173*b40db12bSRiana Tauro 	err = register_nodes(xe);
174*b40db12bSRiana Tauro 	if (err) {
175*b40db12bSRiana Tauro 		drm_err(&xe->drm, "Failed to register DRM RAS nodes (%pe)\n", ERR_PTR(err));
176*b40db12bSRiana Tauro 		return err;
177*b40db12bSRiana Tauro 	}
178*b40db12bSRiana Tauro 
179*b40db12bSRiana Tauro 	err = drmm_add_action_or_reset(&xe->drm, xe_drm_ras_unregister_nodes, xe);
180*b40db12bSRiana Tauro 	if (err) {
181*b40db12bSRiana Tauro 		drm_err(&xe->drm, "Failed to add action for Xe DRM RAS (%pe)\n", ERR_PTR(err));
182*b40db12bSRiana Tauro 		return err;
183*b40db12bSRiana Tauro 	}
184*b40db12bSRiana Tauro 
185*b40db12bSRiana Tauro 	return 0;
186*b40db12bSRiana Tauro }
187