xref: /linux/drivers/gpu/drm/xe/xe_drm_ras.c (revision b5fa84e805a61d3c1a741035ac793674833d3ca0)
1 // SPDX-License-Identifier: MIT
2 /*
3  * Copyright © 2026 Intel Corporation
4  */
5 
6 #include <linux/bitmap.h>
7 
8 #include <drm/drm_managed.h>
9 #include <drm/drm_print.h>
10 #include <drm/drm_ras.h>
11 
12 #include "xe_device_types.h"
13 #include "xe_drm_ras.h"
14 
/*
 * Name tables, initialized from the DRM_XE_RAS_ERROR_*_NAMES macros (defined
 * outside this file). error_components[] is indexed by error component id
 * when the per-severity counter tables are populated, and error_severity[]
 * is indexed by severity to obtain the node name.
 */
static const char * const error_components[] = DRM_XE_RAS_ERROR_COMPONENT_NAMES;
static const char * const error_severity[] = DRM_XE_RAS_ERROR_SEVERITY_NAMES;
17 
18 static int hw_query_error_counter(struct xe_drm_ras_counter *info,
19 				  u32 error_id, const char **name, u32 *val)
20 {
21 	if (!info || !info[error_id].name)
22 		return -ENOENT;
23 
24 	*name = info[error_id].name;
25 	*val = atomic_read(&info[error_id].counter);
26 
27 	return 0;
28 }
29 
30 static int hw_clear_error_counter(struct xe_drm_ras_counter *info, u32 error_id)
31 {
32 	if (!info || !info[error_id].name)
33 		return -ENOENT;
34 
35 	atomic_set(&info[error_id].counter, 0);
36 
37 	return 0;
38 }
39 
40 static int query_uncorrectable_error_counter(struct drm_ras_node *ep, u32 error_id,
41 					     const char **name, u32 *val)
42 {
43 	struct xe_device *xe = ep->priv;
44 	struct xe_drm_ras *ras = &xe->ras;
45 	struct xe_drm_ras_counter *info = ras->info[DRM_XE_RAS_ERR_SEV_UNCORRECTABLE];
46 
47 	return hw_query_error_counter(info, error_id, name, val);
48 }
49 
50 static int clear_uncorrectable_error_counter(struct drm_ras_node *node, u32 error_id)
51 {
52 	struct xe_device *xe = node->priv;
53 	struct xe_drm_ras *ras = &xe->ras;
54 	struct xe_drm_ras_counter *info = ras->info[DRM_XE_RAS_ERR_SEV_UNCORRECTABLE];
55 
56 	return hw_clear_error_counter(info, error_id);
57 }
58 
59 static int query_correctable_error_counter(struct drm_ras_node *ep, u32 error_id,
60 					   const char **name, u32 *val)
61 {
62 	struct xe_device *xe = ep->priv;
63 	struct xe_drm_ras *ras = &xe->ras;
64 	struct xe_drm_ras_counter *info = ras->info[DRM_XE_RAS_ERR_SEV_CORRECTABLE];
65 
66 	return hw_query_error_counter(info, error_id, name, val);
67 }
68 
69 static int clear_correctable_error_counter(struct drm_ras_node *node, u32 error_id)
70 {
71 	struct xe_device *xe = node->priv;
72 	struct xe_drm_ras *ras = &xe->ras;
73 	struct xe_drm_ras_counter *info = ras->info[DRM_XE_RAS_ERR_SEV_CORRECTABLE];
74 
75 	return hw_clear_error_counter(info, error_id);
76 }
77 
78 static struct xe_drm_ras_counter *allocate_and_copy_counters(struct xe_device *xe)
79 {
80 	struct xe_drm_ras_counter *counter;
81 	int i;
82 
83 	counter = kcalloc(DRM_XE_RAS_ERR_COMP_MAX, sizeof(*counter), GFP_KERNEL);
84 	if (!counter)
85 		return ERR_PTR(-ENOMEM);
86 
87 	for (i = DRM_XE_RAS_ERR_COMP_CORE_COMPUTE; i < DRM_XE_RAS_ERR_COMP_MAX; i++) {
88 		if (!error_components[i])
89 			continue;
90 
91 		counter[i].name = error_components[i];
92 		atomic_set(&counter[i].counter, 0);
93 	}
94 
95 	return counter;
96 }
97 
98 static int assign_node_params(struct xe_device *xe, struct drm_ras_node *node,
99 			      const enum drm_xe_ras_error_severity severity)
100 {
101 	struct pci_dev *pdev = to_pci_dev(xe->drm.dev);
102 	struct xe_drm_ras *ras = &xe->ras;
103 	const char *device_name;
104 
105 	device_name = kasprintf(GFP_KERNEL, "%04x:%02x:%02x.%d",
106 				pci_domain_nr(pdev->bus), pdev->bus->number,
107 				PCI_SLOT(pdev->devfn), PCI_FUNC(pdev->devfn));
108 
109 	if (!device_name)
110 		return -ENOMEM;
111 
112 	node->device_name = device_name;
113 	node->node_name = error_severity[severity];
114 	node->type = DRM_RAS_NODE_TYPE_ERROR_COUNTER;
115 	node->error_counter_range.first = DRM_XE_RAS_ERR_COMP_CORE_COMPUTE;
116 	node->error_counter_range.last = DRM_XE_RAS_ERR_COMP_MAX - 1;
117 	node->priv = xe;
118 
119 	ras->info[severity] = allocate_and_copy_counters(xe);
120 	if (IS_ERR(ras->info[severity]))
121 		return PTR_ERR(ras->info[severity]);
122 
123 	if (severity == DRM_XE_RAS_ERR_SEV_CORRECTABLE) {
124 		node->query_error_counter = query_correctable_error_counter;
125 		node->clear_error_counter = clear_correctable_error_counter;
126 	} else {
127 		node->query_error_counter = query_uncorrectable_error_counter;
128 		node->clear_error_counter = clear_uncorrectable_error_counter;
129 	}
130 
131 	return 0;
132 }
133 
134 static void cleanup_node_param(struct xe_drm_ras *ras, const enum drm_xe_ras_error_severity severity)
135 {
136 	struct drm_ras_node *node = &ras->node[severity];
137 
138 	kfree(ras->info[severity]);
139 	ras->info[severity] = NULL;
140 
141 	kfree(node->device_name);
142 	node->device_name = NULL;
143 }
144 
145 static int register_nodes(struct xe_device *xe)
146 {
147 	struct xe_drm_ras *ras = &xe->ras;
148 	int i;
149 
150 	for_each_error_severity(i) {
151 		struct drm_ras_node *node = &ras->node[i];
152 		int ret;
153 
154 		ret = assign_node_params(xe, node, i);
155 		if (ret) {
156 			cleanup_node_param(ras, i);
157 			return ret;
158 		}
159 
160 		ret = drm_ras_node_register(node);
161 		if (ret) {
162 			cleanup_node_param(ras, i);
163 			return ret;
164 		}
165 	}
166 
167 	return 0;
168 }
169 
170 static void xe_drm_ras_unregister_nodes(struct drm_device *device, void *arg)
171 {
172 	struct xe_device *xe = arg;
173 	struct xe_drm_ras *ras = &xe->ras;
174 	int i;
175 
176 	for_each_error_severity(i) {
177 		struct drm_ras_node *node = &ras->node[i];
178 
179 		drm_ras_node_unregister(node);
180 		cleanup_node_param(ras, i);
181 	}
182 }
183 
184 /**
185  * xe_drm_ras_init() - Initialize DRM RAS
186  * @xe: xe device instance
187  *
188  * Allocate and register DRM RAS nodes per device
189  *
190  * Return: 0 on success, negative error code otherwise.
191  */
192 int xe_drm_ras_init(struct xe_device *xe)
193 {
194 	struct xe_drm_ras *ras = &xe->ras;
195 	struct drm_ras_node *node;
196 	int err;
197 
198 	node = drmm_kcalloc(&xe->drm, DRM_XE_RAS_ERR_SEV_MAX, sizeof(*node), GFP_KERNEL);
199 	if (!node)
200 		return -ENOMEM;
201 
202 	ras->node = node;
203 
204 	err = register_nodes(xe);
205 	if (err) {
206 		drm_err(&xe->drm, "Failed to register DRM RAS nodes (%pe)\n", ERR_PTR(err));
207 		return err;
208 	}
209 
210 	err = drmm_add_action_or_reset(&xe->drm, xe_drm_ras_unregister_nodes, xe);
211 	if (err) {
212 		drm_err(&xe->drm, "Failed to add action for Xe DRM RAS (%pe)\n", ERR_PTR(err));
213 		return err;
214 	}
215 
216 	return 0;
217 }
218